brother.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. # -*- coding: utf-8 -*-
  2. '''
  3. Created on Nov 18, 2019 @author: levente.marton
  4. '''
  5. import fitz
  6. import re
  7. class itemList(object):
  8. def __init__(self, item_list, jump):
  9. self.item_list = item_list
  10. self.jump = jump
  11. to_del = self.item_list.index('')
  12. del(self.item_list[to_del])
  13. self.codes = []
  14. for c in range(0, len(self.item_list), self.jump):
  15. if ' ' in self.item_list[c].strip():
  16. code = self.item_list[c + 1].strip()
  17. else:
  18. code = self.item_list[c].strip()
  19. self.codes.append(code)
  20. self._cleanup(self.codes)
  21. self.qnt = []
  22. for c in range(1, len(self.item_list), self.jump):
  23. # q = self.item_list[c].strip()
  24. if len(self.item_list[c].strip()) > 3:
  25. q = self.item_list[c + 1].strip()
  26. else:
  27. q = self.item_list[c].strip()
  28. self.qnt.append(q)
  29. self._cleanup(self.qnt)
  30. self.net_val = []
  31. for c in range(2, len(self.item_list), self.jump):
  32. try:
  33. if ',' not in self.item_list[c].strip():
  34. val = self.item_list[c + 1].strip().replace(',', '.')
  35. else:
  36. val = self.item_list[c].strip().replace(',', '.')
  37. except IndexError:
  38. val = self.item_list[c].strip().replace(',', '.')
  39. self.net_val.append(val)
  40. self._cleanup(self.net_val)
  41. self.disc = []
  42. torem = '- %'
  43. for c in range(3, len(self.item_list), self.jump):
  44. d = self.item_list[c].strip()
  45. if d == '':
  46. d = '0'
  47. self.disc.append(d)
  48. else:
  49. self.disc.append('-' + d.replace(torem, ''))
  50. # self._cleanup(self.disc)
  51. def _cleanup(self, _list):
  52. if _list[-1] == '':
  53. del(_list[-1])
  54. def __getitem__(self, k):
  55. return self.__dict__[k]
  56. class brotherInvoice(object):
  57. '''
  58. classdocs
  59. '''
  60. def __init__(self, invoice):
  61. '''
  62. Constructor
  63. '''
  64. self.doc = fitz.Document(invoice)
  65. pages = ()
  66. for p in range(len(self.doc)):
  67. page = self.doc.loadPage(p)
  68. pages += (page,)
  69. self.pages = pages
  70. self.fpage = self.doc.loadPage(0)
  71. # self.spage = self.doc.loadPage(1).getText('text')
  72. self.cnt = self.fpage.getText('text')
  73. if re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt):
  74. self.items_start = re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt).end()
  75. else:
  76. self.items_start = 0
  77. self.fitems_end = re.search(' \nInvoice Address:', self.cnt)
  78. self.lpage = self.doc.loadPage(len(self.doc) - 1)
  79. self.cnt2 = '\n' + self.lpage.getText('text')
  80. self.items_end = re.search('Goods Value', self.cnt2)
  81. items_area = ''
  82. for _ in range(len(self.doc) - 1):
  83. if _ == 0:
  84. items_area += self.cnt[self.items_start:self.fitems_end.start()-1] + 2 * '\n'
  85. else:
  86. p = self.doc.loadPage(_).getText('text')
  87. items_area += '\n' + p
  88. items_area += self.cnt2[:self.items_end.start()]
  89. self.items_area = self.cleanup(items_area).split('\n')
  90. self.items_details = itemList(self.items_area, 10)
  91. self.size = len(self.items_details.codes)
  92. if re.search('(?<=Invoice No. \n)[0-9.0-9.0-9]+', self.cnt):
  93. self.data = re.search('(?<=Invoice No. \n)[0-9.0-9.0-9]+', self.cnt).group(0)
  94. else:
  95. self.data = re.search('(?<=[0-9]{9}\n)[0-9.0-9.0-9]+', self.cnt).group(0)
  96. self.inv_nr = re.search('(?<={}\n)[0-9]+'.format(self.data), self.cnt).group(0)
  97. self.vat_num = re.search('(?<=Umsatzsteuer ID Nr. )[A-Z0-9]+', self.cnt).group(0)
  98. def __getitem__(self, k):
  99. return self.__dict__[k]
  100. def __len__(self):
  101. return self.size
  102. def cleanup(self, text):
  103. # start = re.finditer('Delivery Note:', text)
  104. start = len(self.doc)
  105. # end = re.finditer('Delivery Date: [0-9.0-9.0-9]+', text)
  106. # count = tuple(start)
  107. for _ in range(start):
  108. st = re.search('Delivery Note:', text)
  109. en = re.search('Delivery Date: [0-9.0-9.0-9]+', text)
  110. st_page = re.search('Brother Internationale', text)
  111. en_page = re.search('Original', text)
  112. block = None
  113. if st and en:
  114. block = text[st.start():en.end()+1]
  115. block2 = None
  116. if st_page and en_page:
  117. block2 = text[st_page.start()-1:en_page.end()]
  118. obs = re.search('INCL.ALL PLATEN INSERTS', text)
  119. obs2 = re.search('Customer Mat No.*\n \n', text)
  120. # TO DO find Cust.Mat.No.: and clear
  121. obs3 = re.search('Cust\.Mat\.No.*\n \n \n', text)
  122. if obs2:
  123. obs2 = obs2.group(0)
  124. if obs3:
  125. obs3 = obs3.group(0)
  126. if obs:
  127. obs = obs.group(0)
  128. if st and en:
  129. text = text.replace(block, '')
  130. if obs:
  131. text = text.replace(obs, '')
  132. if obs2:
  133. text = text.replace(obs2, '')
  134. if obs3:
  135. text = text.replace(obs3, '')
  136. if block2:
  137. text = text.replace(block2, '')
  138. return text
  139. class brotherOrder(object):
  140. def __init__(self, order):
  141. self.doc = fitz.Document(order)
  142. pages = ()
  143. for p in range(len(self.doc)):
  144. page = self.doc.loadPage(p)
  145. pages += (page,)
  146. self.pages = pages
  147. self.fpage = self.doc.loadPage(0)
  148. # self.spage = self.doc.loadPage(1).getText('text')
  149. self.cnt = self.fpage.getText('text')
  150. if re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt):
  151. self.items_start = re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt).end()
  152. else:
  153. self.items_start = 0
  154. self.fitems_end = re.search('\nInvoice Address:', self.cnt)
  155. self.lpage = self.doc.loadPage(len(self.doc)-1)
  156. self.cnt2 = self.lpage.getText('text')
  157. self.items_end = re.search('Goods Value', self.cnt2)
  158. items_area = ''
  159. for _ in range(len(self.doc)-1):
  160. if _ == 0:
  161. items_area += self.cnt[self.items_start:self.fitems_end.start()-1] + 2 * '\n'
  162. else:
  163. p = self.doc.loadPage(_).getText('text')
  164. items_area += '\n' + p
  165. items_area += self.cnt2[:self.items_end.start()]
  166. self.items_area = self.cleanup(items_area).split('\n')
  167. self.items_details = itemList(self.items_area, 9)
  168. def cleanup(self, text):
  169. # start = re.finditer('Delivery Note:', text)
  170. start = len(self.doc)
  171. # end = re.finditer('Delivery Date: [0-9.0-9.0-9]+', text)
  172. # count = tuple(start)
  173. for _ in range(start):
  174. st = re.search('Delivery Note:', text)
  175. en = re.search('Delivery Date: [0-9.0-9.0-9]+', text)
  176. st_page = re.search('Brother Internationale', text)
  177. en_page = re.search('Original', text)
  178. block = None
  179. if st and en:
  180. block = text[st.start():en.end()+1]
  181. block2 = None
  182. if st_page and en_page:
  183. block2 = text[st_page.start()-1:en_page.end()]
  184. obs = re.search('INCL.ALL PLATEN INSERTS', text)
  185. obs2 = re.search('Customer Mat No.*\n \n', text)
  186. # TO DO find Cust.Mat.No.: and clear
  187. obs3 = re.search('Cust\.Mat\.No.*\n \n \n', text)
  188. del_date = re.search('Cust.Mat.No.:[ 0-9A-Za-z]*[\n]{1}', text)
  189. if obs2:
  190. obs2 = obs2.group(0)
  191. if obs3:
  192. obs3 = obs3.group(0)
  193. if obs:
  194. obs = obs.group(0)
  195. if del_date:
  196. del_date = del_date.group(0)
  197. if st and en:
  198. text = text.replace(block, '')
  199. if obs:
  200. text = text.replace(obs, '')
  201. if obs2:
  202. text = text.replace(obs2, '')
  203. if obs3:
  204. text = text.replace(obs3, '')
  205. if del_date:
  206. text = text.replace(del_date, '')
  207. if block2:
  208. text = text.replace(block2, '')
  209. return text
  210. if __name__ == '__main__':
  211. order = brotherOrder('pdf/ALTMP_P10_27.pdf')
  212. # inv = brotherInvoice('sample/3140149557.pdf')
  213. # for i in order.items_area:
  214. # print(i)
  215. print(order.items_area)
  216. # print(order.items_details.net_val)
  217. # print(order.items_details.qnt)
  218. # print(order.items_details.codes)
  219. # for i in order.items_area:
  220. # print(i)
  221. # for o in order.items_area:
  222. # print(o)
  223. # print(len(inv.items_area))
  224. # print(ord.items_details.codes)
  225. # for i in inv.items_area:
  226. # print(i)