# coding:utf-8
"""Exploratory script: dump the text between table markers of a PDF invoice.

Opens the first page of the invoice with PyMuPDF (``fitz``), locates a few
marker strings in the extracted page text and prints the text that lies
between them.
"""
import os
import re

PDF_PATH = 'MOZAIK CONSULTING INV0001145.PDF'

# Hard-coded character offsets into the extracted page text.
# NOTE(review): presumably these delimit the line-item table of this one
# specific invoice -- confirm, and prefer the marker-based slice below.
TABLE_START = 991
TABLE_END = 2521

# Separators used to split the extracted text into rows and cells.
ROW_SEP = 3 * ' ' + '\n'
CELL_SEP = 1 * ' ' + '\n'


def search_text(page, word):
    """Count the words on *page* that contain *word*.

    Args:
        page: Any object exposing ``getTextWords()`` that returns a list of
            tuples whose element at index 4 is the word text.
        word: Substring to look for inside each word's text.

    Returns:
        A ``(found, index)`` tuple: ``found`` is the number of words
        containing *word*; ``index`` is the position in the word list of the
        last such word, or ``None`` when nothing matched.
    """
    words = page.getTextWords()
    found = 0
    last_match = None
    for entry in words:
        if word in entry[4]:
            last_match = entry
            found += 1
    if last_match is None:
        # The original fell through to an unbound local here and crashed.
        return found, None
    return found, words.index(last_match)


def main():
    """Open the invoice, locate the markers and print the extracted text.

    Raises:
        ValueError: if the 'Price (€)' or 'Total Net Value (€)' marker is
            not present in the page text.
    """
    # Imported here so that search_text() stays importable (and testable)
    # on machines where PyMuPDF is not installed.
    import fitz

    document = fitz.Document(PDF_PATH)
    try:
        page = document.loadPage(0)
        # Extract the page text once; every search below works on this string.
        text = page.getText('text')

        head = re.search(r'Ln', text)
        head2 = re.search(r'Price \(€\)', text)
        foot = re.search(r'Total Net Value \(€\)', text)
        # Invoice number (digits following "INV"); located but not used yet.
        inv_nr = re.search(r'(?<=INV)[0-9$]+', text)
        print(head, head2, foot)

        # NOTE: an earlier attempt located the header via page.searchFor()
        # rectangles and filtered page.getTextWords() by fitz.Rect; it was
        # abandoned in favour of the regex/offset approach below.

        fixed_rows = text[TABLE_START:TABLE_END].split(ROW_SEP)
        print(fixed_rows[0].split(CELL_SEP))

        if head2 is None or foot is None:
            raise ValueError(
                "table markers 'Price (€)' / 'Total Net Value (€)' "
                'not found in page text'
            )
        marker_rows = text[head2.end() + 1: foot.start()].split(ROW_SEP)
        print(marker_rows, len(fixed_rows) - 1)
    finally:
        document.close()

    # print(search_text(page, 'Ln'))


if __name__ == '__main__':
    main()