123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- # coding:utf-8
- import fitz
- import os
- import re
# Open the invoice PDF and work on its first page.
file = fitz.Document('MOZAIK CONSULTING INV0001145.PDF')
cnt = file.loadPage(0)

# Extract the page's plain text once; every search and slice below reuses it.
text = cnt.getText('text')

# Disabled alternative: look the markers up with the page's own search.
# head = cnt.searchFor('Ln')
# head2 = cnt.searchFor('Price (€)')

# Raw strings keep the regex escapes (\( and \)) away from Python's own
# string-escape processing, which otherwise warns about invalid escapes.
# Each result is a match object, or None when the marker is absent.
head = re.search(r'Ln', text)
head2 = re.search(r'Price \(€\)', text)
foot = re.search(r'Total Net Value \(€\)', text)
inv_nr = re.search(r'(?<=INV)[0-9$]+', text)
print(head, head2, foot)

# Disabled alternative: collect the words lying inside the rectangle
# spanned by the two marker hits.
# rect = head[0] | head2[0]
# my_word = [w for w in cnt.getTextWords() if fitz.Rect(w[:4]) in rect]
# # print(my_word)
# headers = []
#
# for header_word in my_word:
#     headers.append(header_word[4])
#
# print(headers)

# Separators used to cut the text: three spaces + newline, and one
# space + newline.
wide_sep = 3 * ' ' + '\n'
narrow_sep = 1 * ' ' + '\n'

# NOTE(review): 991 and 2521 are hard-coded character offsets, presumably
# tuned for this particular invoice -- confirm before reusing on other files.
window_chunks = text[991:2521].split(wide_sep)
print(window_chunks[0].split(narrow_sep))

# Text between the end of the 'Price (€)' marker and the start of the
# 'Total Net Value (€)' marker, plus the number of wide separators found
# in the fixed window above. Raises AttributeError if either marker was
# not found (head2 or foot is None).
print(text[head2.end() + 1: foot.start()].split(wide_sep),
      len(window_chunks) - 1)
def search_text(page, word):
    """Count the word entries on ``page`` that contain ``word``.

    Args:
        page: Object exposing ``getTextWords()``, which returns a sequence
            of entries; the item at index 4 of each entry is what gets
            searched.
        word: Value tested with ``in`` against item 4 of every entry.

    Returns:
        A ``(found, index)`` tuple. ``found`` is the number of matching
        entries; ``index`` is the position of the last matching entry in
        the sequence returned by ``getTextWords()``, or ``None`` when
        nothing matched (previously this case raised UnboundLocalError).
    """
    found = 0
    last_index = None
    # Track the position while scanning instead of re-searching the list
    # afterwards with list.index().
    for index, entry in enumerate(page.getTextWords()):
        if word in entry[4]:
            found += 1
            last_index = index
    return found, last_index
- # print(search_text(cnt, 'Ln'))
|