# ocr.py (1.7 KB)
  1. # coding: utf-8
  2. import re
  3. import argparse
  4. import os
  5. import pytesseract
  6. import cv2
  7. from PIL import Image, ImageOps, ImageEnhance
  8. img_files = [f for f in os.listdir() if f.endswith('.png')]
  9. img_files.sort(key=lambda x: os.stat(os.path.join(x)).st_mtime, reverse=True)
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument('-file', type=str, help='name of the scanned file to be converted',
  12. default=img_files[0])
  13. parser.add_argument('-p', '--preprocess', type=str, default=None,
  14. help='type of preprocessing to be done')
  15. args = parser.parse_args()
  16. image = cv2.imread(args.file)
  17. gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  18. if args.preprocess == 'thresh':
  19. gray = cv2.threshold(gray, 0, 255,
  20. cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
  21. elif args.preprocess == 'blur':
  22. gray = cv2.medianBlur(gray, 3)
  23. new_im = cv2.imwrite('new_cv_image.jpg', gray)
  24. # image = Image.open(args.file)
  25. pytesseract.pytesseract.tesseract_cmd = r'c:/Users/Levi/appData/Local/Tesseract-OCR/tesseract.exe'
  26. print('tesseract version {}'.format(pytesseract.get_tesseract_version()))
  27. print('processing', args.file)
  28. with open('string.txt', 'w') as str_file:
  29. str_file.writelines(re.sub(' {1,5}(?![a-zA-Z0-9])', '', pytesseract.image_to_string(Image.open('new_cv_image.jpg'))))
  30. with open('string.txt', 'r') as str_file, open('stripped_string.txt', 'w+') as str_out_file:
  31. for empty_line in str_file.readlines():
  32. if not empty_line.strip(): continue
  33. str_out_file.writelines(empty_line)
  34. pdf = pytesseract.image_to_pdf_or_hocr(Image.open('new_cv_image.jpg'), extension='pdf')
  35. with open('new_cv_image.pdf', 'w+b') as pdf_file:
  36. pdf_file.write(bytearray(pdf))