123456789101112131415161718192021222324252627282930313233343536373839404142 |
- # coding: utf-8
- import re
- import argparse
- import os
- import pytesseract
- import cv2
- from PIL import Image, ImageOps, ImageEnhance
# Tesseract binary location used by the original author's Windows setup.
# It is only applied when the file actually exists; otherwise pytesseract
# keeps its default and looks the executable up on PATH.
TESSERACT_CMD = r'c:/Users/Levi/appData/Local/Tesseract-OCR/tesseract.exe'

# Files produced in the current working directory.
INTERMEDIATE_IMAGE = 'new_cv_image.jpg'
RAW_TEXT_FILE = 'string.txt'
STRIPPED_TEXT_FILE = 'stripped_string.txt'
PDF_FILE = 'new_cv_image.pdf'

# Runs of 1-5 spaces that are NOT followed by a letter or digit, i.e. spaces
# before punctuation, before a line break, before another space, or at the
# very end of the text.
STRAY_SPACES = re.compile(' {1,5}(?![a-zA-Z0-9])')


def find_newest_png(directory=None):
    """Return the path of the most recently modified ``.png`` in *directory*.

    *directory* defaults to the current working directory, in which case the
    bare file name is returned. Returns ``None`` when there is no ``.png``
    file at all, instead of raising ``IndexError``.
    """
    candidates = [
        os.path.join(directory or '', name)
        for name in os.listdir(directory)
        if name.endswith('.png')
    ]
    return max(candidates, key=os.path.getmtime, default=None)


def build_parser():
    """Create the command-line parser for the OCR script."""
    parser = argparse.ArgumentParser()
    # The default is resolved lazily in main(): scanning the directory here
    # would crash on an empty directory even when -file is given explicitly.
    parser.add_argument('-file', type=str, default=None,
                        help='name of the scanned file to be converted '
                             '(default: newest .png in the current directory)')
    parser.add_argument('-p', '--preprocess', type=str, default=None,
                        help='type of preprocessing to be done')
    return parser


def preprocess(gray, mode):
    """Apply the optional clean-up step selected by *mode* to a gray image.

    ``'thresh'`` binarises the image with Otsu's threshold, ``'blur'`` applies
    a 3x3 median blur; any other value returns the image unchanged.
    """
    if mode == 'thresh':
        return cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    if mode == 'blur':
        return cv2.medianBlur(gray, 3)
    return gray


def strip_stray_spaces(text):
    """Remove the space runs matched by ``STRAY_SPACES`` from *text*."""
    return STRAY_SPACES.sub('', text)


def drop_blank_lines(lines):
    """Return the lines of *lines* that contain non-whitespace characters."""
    return [line for line in lines if line.strip()]


def main(argv=None):
    """Run OCR on a scanned image and write text and PDF results.

    Writes ``new_cv_image.jpg`` (the preprocessed image), ``string.txt``
    (OCR text with stray spaces removed), ``stripped_string.txt`` (the same
    text without blank lines) and ``new_cv_image.pdf`` to the current
    working directory.

    *argv* is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    source = args.file if args.file is not None else find_newest_png()
    if source is None:
        parser.error('no -file given and no .png file found '
                     'in the current directory')

    # cv2.imread signals failure by returning None rather than raising.
    image = cv2.imread(source)
    if image is None:
        parser.error('could not read image file {!r}'.format(source))

    gray = preprocess(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY),
                      args.preprocess)
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(INTERMEDIATE_IMAGE, gray):
        raise OSError('could not write {!r}'.format(INTERMEDIATE_IMAGE))

    if os.path.isfile(TESSERACT_CMD):
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
    print('tesseract version {}'.format(pytesseract.get_tesseract_version()))
    print('processing', source)

    # Open the intermediate image once for both OCR passes and close it
    # deterministically afterwards.
    with Image.open(INTERMEDIATE_IMAGE) as ocr_image:
        text = pytesseract.image_to_string(ocr_image)
        pdf = pytesseract.image_to_pdf_or_hocr(ocr_image, extension='pdf')

    # OCR output may contain non-ASCII characters, so the encoding is pinned
    # instead of relying on the platform default.
    with open(RAW_TEXT_FILE, 'w', encoding='utf-8') as str_file:
        str_file.write(strip_stray_spaces(text))
    with open(RAW_TEXT_FILE, 'r', encoding='utf-8') as str_file, \
            open(STRIPPED_TEXT_FILE, 'w', encoding='utf-8') as str_out_file:
        str_out_file.writelines(drop_blank_lines(str_file))

    with open(PDF_FILE, 'wb') as pdf_file:
        pdf_file.write(pdf)


if __name__ == '__main__':
    main()
|