invoice_reader.py 1.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. # coding:utf-8
  2. import fitz
  3. import os
  4. import re
  # Open the invoice PDF and work on its first page only.
  file = fitz.Document('MOZAIK CONSULTING INV0001145.PDF')
  cnt = file.loadPage(0)

  # Extract the page's plain text once; every search and slice below reuses it
  # instead of re-extracting the text for each regex.
  text = cnt.getText('text')

  # A commented-out earlier experiment located the header through
  # cnt.searchFor('Ln') / cnt.searchFor('Price (€)') rectangles and
  # cnt.getTextWords(); the regex searches below are the active approach.

  # Raw strings keep the regex escapes (``\(``) from being read as (invalid)
  # Python string escapes.
  head = re.search(r'Ln', text)
  head2 = re.search(r'Price \(€\)', text)
  foot = re.search(r'Total Net Value \(€\)', text)
  # NOTE(review): inside a character class ``$`` is a literal dollar sign, so
  # this matches digits and/or ``$`` following "INV" -- confirm that is intended.
  inv_nr = re.search(r'(?<=INV)[0-9$]+', text)
  print(head, head2, foot)

  # Separators used when splitting the extracted text.
  row_sep = 3 * ' ' + '\n'
  cell_sep = 1 * ' ' + '\n'

  # NOTE(review): 991 and 2521 are hard-coded character offsets into this
  # particular document's text; presumably they bracket the same table that
  # head2/foot locate below -- confirm before reusing on other invoices.
  table_text = text[991:2521]
  print(table_text.split(row_sep)[0].split(cell_sep))

  # Fail with a clear message instead of an AttributeError on ``None`` when
  # either marker is missing from the page text.
  if head2 is None or foot is None:
      raise ValueError(
          "Could not locate 'Price (€)' and/or 'Total Net Value (€)' "
          "in the page text"
      )

  # Text between the end of the 'Price (€)' match and the start of the
  # 'Total Net Value (€)' match, split on row_sep, followed by the number of
  # row_sep-separated pieces in the fixed slice minus one.
  print(text[head2.end() + 1: foot.start()].split(row_sep),
        len(table_text.split(row_sep)) - 1)
  def search_text(page, word):
      """Count the words on *page* that contain *word* and locate the last one.

      Args:
          page: Object exposing ``getTextWords()``. Element 4 of each
              returned entry is treated as the word's text.
          word: Text to look for; matched with a case-sensitive ``in`` test,
              so partial matches count.

      Returns:
          A ``(found, index)`` tuple. ``found`` is the number of matching
          entries and ``index`` is the position of the last match in the
          list returned by ``getTextWords()``. When nothing matches, the
          result is ``(0, None)`` (previously this raised
          ``UnboundLocalError``).
      """
      found = 0
      last_index = None
      # Track the index while scanning rather than searching the list again
      # afterwards with list.index().
      for index, entry in enumerate(page.getTextWords()):
          if word in entry[4]:
              found += 1
              last_index = index
      return found, last_index
  35. # print(search_text(cnt, 'Ln'))