# coding: utf-8
"""OCR a scanned image with Tesseract.

The script converts the input image to grayscale (optionally thresholding or
blurring it), saves the result as ``new_cv_image.jpg`` and then writes:

* ``string.txt``          - the recognized text with stray spaces removed,
* ``stripped_string.txt`` - the same text without blank lines,
* ``new_cv_image.pdf``    - a PDF rendered by Tesseract from the image.

Usage::

    python script.py [-file IMAGE] [-p {thresh,blur}]

When ``-file`` is omitted, the most recently modified ``.jpg`` in the current
working directory is used.
"""
import argparse
import os
import re

import cv2
import pytesseract
from PIL import Image

# Conventional Windows install location of Tesseract. It is only applied when
# the file actually exists; otherwise pytesseract's default lookup is used.
TESSERACT_CMD = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

PREPROCESSED_IMAGE = 'new_cv_image.jpg'
RAW_TEXT_FILE = 'string.txt'
STRIPPED_TEXT_FILE = 'stripped_string.txt'
PDF_FILE = 'new_cv_image.pdf'

# Runs of one to five spaces that are NOT immediately followed by an ASCII
# letter or digit.
_STRAY_SPACES = re.compile(' {1,5}(?![a-zA-Z0-9])')


def _newest_jpg():
    """Return the most recently modified ``.jpg`` in the cwd, or ``None``."""
    candidates = [name for name in os.listdir() if name.endswith('.jpg')]
    if not candidates:
        return None
    return max(candidates, key=lambda name: os.stat(name).st_mtime)


def _parse_args(argv=None):
    """Parse the command line.

    Exits with a usage error when no input file was given and no ``.jpg``
    exists in the current directory to fall back on.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-file', type=str,
                        help='name of the scanned file to be converted',
                        default=_newest_jpg())
    parser.add_argument('-p', '--preprocess', type=str, default=None,
                        help='type of preprocessing to be done')
    args = parser.parse_args(argv)
    if args.file is None:
        parser.error('no -file given and no .jpg file found in the '
                     'current directory')
    return args


def _load_grayscale(path, preprocess):
    """Read *path* and return it as a grayscale image.

    *preprocess* may be ``'thresh'`` (Otsu binarization), ``'blur'``
    (3x3 median blur) or anything else for no extra processing.

    Raises ``SystemExit`` when the image cannot be read.
    """
    image = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise SystemExit('could not read image: {}'.format(path))

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    if preprocess == 'thresh':
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    elif preprocess == 'blur':
        gray = cv2.medianBlur(gray, 3)
    return gray


def _write_text_outputs(text):
    """Write the cleaned OCR text and its blank-line-free variant."""
    # UTF-8 is used explicitly so OCR output containing characters outside
    # the platform's default code page does not abort the run.
    with open(RAW_TEXT_FILE, 'w', encoding='utf-8') as raw_file:
        raw_file.write(_STRAY_SPACES.sub('', text))

    with open(RAW_TEXT_FILE, 'r', encoding='utf-8') as raw_file, \
            open(STRIPPED_TEXT_FILE, 'w+', encoding='utf-8') as stripped_file:
        stripped_file.writelines(line for line in raw_file if line.strip())


def main():
    """Run the preprocessing, OCR and export pipeline."""
    args = _parse_args()

    gray = _load_grayscale(args.file, args.preprocess)
    if not cv2.imwrite(PREPROCESSED_IMAGE, gray):
        raise SystemExit('could not write {}'.format(PREPROCESSED_IMAGE))

    if os.path.isfile(TESSERACT_CMD):
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
    print('tesseract version {}'.format(pytesseract.get_tesseract_version()))
    print('processing', args.file)

    # Open the preprocessed image once and release the handle when done.
    with Image.open(PREPROCESSED_IMAGE) as scan:
        _write_text_outputs(pytesseract.image_to_string(scan, lang='eng'))
        pdf = pytesseract.image_to_pdf_or_hocr(scan, extension='pdf')

    with open(PDF_FILE, 'wb') as pdf_file:
        pdf_file.write(pdf)


if __name__ == '__main__':
    main()