# ocr.py (1.7 KB)
# coding: utf-8
import argparse
import io
import os
import re
import sys

import cv2
import pytesseract
from PIL import Image
  8. img_files = [f for f in os.listdir() if f.endswith('.jpg')]
  9. img_files.sort(key=lambda x: os.stat(os.path.join(x)).st_mtime, reverse=True)
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument('-file', type=str, help='name of the scanned file to be converted',
  12. default=img_files[0])
  13. parser.add_argument('-p', '--preprocess', type=str, default=None,
  14. help='type of preprocessing to be done')
  15. args = parser.parse_args()
  16. image = cv2.imread(args.file)
  17. gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  18. if args.preprocess == 'thresh':
  19. gray = cv2.threshold(gray, 0, 255,
  20. cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
  21. elif args.preprocess == 'blur':
  22. gray = cv2.medianBlur(gray, 3)
  23. new_im = cv2.imwrite('new_cv_image.jpg', gray)
  24. # image = Image.open(args.file)
  25. pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
  26. print('tesseract version {}'.format(pytesseract.get_tesseract_version()))
  27. print('processing', args.file)
  28. with open('string.txt', 'w') as str_file:
  29. str_file.writelines(re.sub(' {1,5}(?![a-zA-Z0-9])', '', pytesseract.image_to_string(Image.open('new_cv_image.jpg'), lang='eng')))
  30. with open('string.txt', 'r') as str_file, open('stripped_string.txt', 'w+') as str_out_file:
  31. for empty_line in str_file.readlines():
  32. if not empty_line.strip(): continue
  33. str_out_file.writelines(empty_line)
  34. pdf = pytesseract.image_to_pdf_or_hocr(Image.open('new_cv_image.jpg'), extension='pdf')
  35. with open('new_cv_image.pdf', 'w+b') as pdf_file:
  36. pdf_file.write(bytearray(pdf))