123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241 |
- # -*- coding: utf-8 -*-
- '''
- Created on Nov 18, 2019 @author: levente.marton
- '''
- import fitz
- import re
class itemList(object):
    """Column-wise view over the flat list of text lines of an item table.

    The constructor walks ``item_list`` in strides of ``jump`` entries and
    builds four parallel lists, exposed as attributes:

    * ``codes``   - read at offset 0 of every stride,
    * ``qnt``     - read at offset 1 of every stride,
    * ``net_val`` - read at offset 2 of every stride, ``,`` replaced by ``.``,
    * ``disc``    - read at offset 3 of every stride; ``'0'`` for an empty
      cell, otherwise ``'-'`` followed by the cell with ``'- %'`` removed.

    For the first three columns, a cell that fails the column's sanity check
    (see ``__init__``) is replaced by the entry that immediately follows it.

    Args:
        item_list: list of strings. The first empty string in it, if there
            is one, is removed **in place**.
        jump: number of list entries that make up one stride.
    """

    def __init__(self, item_list, jump):
        self.item_list = item_list
        self.jump = jump
        # Remove the first empty entry (in place, as callers may hold a
        # reference to the same list). A list without any empty entry is
        # accepted instead of raising ValueError.
        if '' in self.item_list:
            self.item_list.remove('')

        # A cell containing a blank is skipped in favour of its successor.
        self.codes = self._column(0, lambda cell: ' ' in cell)
        # A cell longer than three characters is skipped the same way.
        self.qnt = self._column(1, lambda cell: len(cell) > 3)
        # A cell without a comma is skipped; the kept cell is then
        # normalised to use '.' instead of ','.
        self.net_val = [
            value.replace(',', '.')
            for value in self._column(2, lambda cell: ',' not in cell)
        ]

        # The last column has no look-ahead and is deliberately not passed
        # through _cleanup.
        self.disc = []
        torem = '- %'
        for c in range(3, len(self.item_list), self.jump):
            d = self.item_list[c].strip()
            if d == '':
                self.disc.append('0')
            else:
                self.disc.append('-' + d.replace(torem, ''))

    def _column(self, offset, use_next):
        """Collect one stripped cell per stride, starting at ``offset``.

        ``use_next`` is a predicate on the stripped cell; when it returns
        true and a following entry exists, that following entry (stripped)
        is taken instead. At the very end of the list, where no following
        entry exists, the current cell is kept rather than raising
        IndexError. The result has a trailing empty string removed.
        """
        cells = self.item_list
        total = len(cells)
        column = []
        for c in range(offset, total, self.jump):
            cell = cells[c].strip()
            if use_next(cell) and c + 1 < total:
                cell = cells[c + 1].strip()
            column.append(cell)
        self._cleanup(column)
        return column

    def _cleanup(self, _list):
        """Drop a trailing empty string from ``_list`` in place, if present."""
        # Guard against an empty list, for which _list[-1] would raise.
        if _list and _list[-1] == '':
            del _list[-1]

    def __getitem__(self, k):
        """Return the instance attribute named ``k`` (KeyError if unknown)."""
        return self.__dict__[k]
class brotherInvoice(object):
    """Text of an invoice document, reduced to the lines of its item table.

    The document is opened with ``fitz.Document``. The text between the
    first ``Delivery Date:`` token of the first page and the ``Goods Value``
    marker of the last page is collected, passed through :meth:`cleanup`,
    split on newlines (``items_area``) and handed to ``itemList`` with a
    stride of 10 (``items_details``).

    Further attributes set by the constructor: ``doc``, ``pages``,
    ``fpage``/``lpage`` (first/last page), ``cnt``/``cnt2`` (their text,
    ``cnt2`` prefixed with a newline), ``items_start``, ``fitems_end``,
    ``items_end``, ``size``, ``data``, ``inv_nr`` and ``vat_num``.

    ``len(obj)`` is the number of entries in ``items_details.codes``;
    ``obj[name]`` returns the instance attribute called ``name``.
    """

    # Fragments that cleanup() strips in addition to the delivery block and
    # the page header. Order matters: it is the order of removal.
    _OBSOLETE_PATTERNS = (
        'INCL.ALL PLATEN INSERTS',
        'Customer Mat No.*\n \n',
        # Raw string: '\.' is not a valid escape in a normal string literal.
        r'Cust\.Mat\.No.*\n \n \n',
    )

    def __init__(self, invoice):
        """Open ``invoice`` and extract the item area and header fields.

        Args:
            invoice: whatever ``fitz.Document`` accepts as its first
                argument.

        Raises:
            AttributeError: when one of the mandatory markers
                (`` \\nInvoice Address:`` on a multi-page document,
                ``Goods Value``, the invoice number block or
                ``Umsatzsteuer ID Nr.``) is not found, because the missing
                match is ``None``.
        """
        # NOTE(review): loadPage/getText are the legacy camelCase PyMuPDF
        # names; newer releases expose load_page/get_text - confirm against
        # the installed version before renaming.
        self.doc = fitz.Document(invoice)
        self.pages = tuple(
            self.doc.loadPage(p) for p in range(len(self.doc))
        )
        self.fpage = self.doc.loadPage(0)
        self.cnt = self.fpage.getText('text')

        # Items start right after the first "Delivery Date: ..." token, or
        # at the top of the page when there is none.
        first_date = re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt)
        self.items_start = first_date.end() if first_date else 0
        self.fitems_end = re.search(' \nInvoice Address:', self.cnt)

        self.lpage = self.doc.loadPage(len(self.doc) - 1)
        self.cnt2 = '\n' + self.lpage.getText('text')
        self.items_end = re.search('Goods Value', self.cnt2)

        # First page: slice between the two markers. Middle pages: whole
        # text. Last page: everything before "Goods Value". For a one-page
        # document only the last-page rule applies.
        chunks = []
        for number in range(len(self.doc) - 1):
            if number == 0:
                first_slice = self.cnt[
                    self.items_start:self.fitems_end.start() - 1
                ]
                chunks.append(first_slice + 2 * '\n')
            else:
                chunks.append('\n' + self.doc.loadPage(number).getText('text'))
        chunks.append(self.cnt2[:self.items_end.start()])
        items_area = ''.join(chunks)

        self.items_area = self.cleanup(items_area).split('\n')
        self.items_details = itemList(self.items_area, 10)
        self.size = len(self.items_details.codes)

        # Dotted-digit token following "Invoice No. \n"; falls back to the
        # token following a nine-digit line.
        token = re.search('(?<=Invoice No. \n)[0-9.0-9.0-9]+', self.cnt)
        if token is None:
            token = re.search('(?<=[0-9]{9}\n)[0-9.0-9.0-9]+', self.cnt)
        self.data = token.group(0)
        # The token is escaped so that its dots only match literal dots.
        self.inv_nr = re.search(
            '(?<={}\n)[0-9]+'.format(re.escape(self.data)), self.cnt
        ).group(0)
        self.vat_num = re.search(
            '(?<=Umsatzsteuer ID Nr. )[A-Z0-9]+', self.cnt
        ).group(0)

    def __getitem__(self, k):
        """Return the instance attribute named ``k`` (KeyError if unknown)."""
        return self.__dict__[k]

    def __len__(self):
        """Return ``size``, the number of item codes found."""
        return self.size

    def cleanup(self, text):
        """Strip repeated header fragments from ``text`` and return it.

        One pass is made per page of ``self.doc``. In every pass the first
        occurrence of each of the following is located and then all copies
        of that exact fragment are removed:

        * the span from ``Delivery Note:`` through the character after the
          ``Delivery Date: ...`` token,
        * each pattern in ``_OBSOLETE_PATTERNS``,
        * the span from the character before ``Brother Internationale``
          through ``Original``.

        Args:
            text: the raw concatenated page text.

        Returns:
            The text with the fragments removed.
        """
        for _ in range(len(self.doc)):
            note = re.search('Delivery Note:', text)
            date = re.search('Delivery Date: [0-9.0-9.0-9]+', text)
            page_head = re.search('Brother Internationale', text)
            page_tail = re.search('Original', text)

            # Every fragment is located on the text as it is at the start
            # of the pass; removal happens afterwards, in this order.
            fragments = []
            if note and date:
                fragments.append(text[note.start():date.end() + 1])
            for pattern in self._OBSOLETE_PATTERNS:
                found = re.search(pattern, text)
                if found:
                    fragments.append(found.group(0))
            if page_head and page_tail:
                # Clamp at 0: a header starting at offset 0 would otherwise
                # give a slice start of -1 and an empty fragment.
                begin = max(page_head.start() - 1, 0)
                header = text[begin:page_tail.end()]
                if header:
                    fragments.append(header)

            for fragment in fragments:
                text = text.replace(fragment, '')
        return text
class brotherOrder(object):
    """Text of an order document, reduced to the lines of its item table.

    The document is opened with ``fitz.Document``. The text between the
    first ``Delivery Date:`` token of the first page and the ``Goods Value``
    marker of the last page is collected, passed through :meth:`cleanup`,
    split on newlines (``items_area``) and handed to ``itemList`` with a
    stride of 9 (``items_details``).

    Further attributes set by the constructor: ``doc``, ``pages``,
    ``fpage``/``lpage`` (first/last page), ``cnt``/``cnt2`` (their text),
    ``items_start``, ``fitems_end`` and ``items_end``.
    """

    # Fragments that cleanup() strips in addition to the delivery block and
    # the page header. Order matters: it is the order of removal.
    _OBSOLETE_PATTERNS = (
        'INCL.ALL PLATEN INSERTS',
        'Customer Mat No.*\n \n',
        # Raw string: '\.' is not a valid escape in a normal string literal.
        r'Cust\.Mat\.No.*\n \n \n',
        'Cust.Mat.No.:[ 0-9A-Za-z]*[\n]{1}',
    )

    def __init__(self, order):
        """Open ``order`` and extract the item area.

        Args:
            order: whatever ``fitz.Document`` accepts as its first argument.

        Raises:
            AttributeError: when ``\\nInvoice Address:`` (multi-page
                documents) or ``Goods Value`` is not found, because the
                missing match is ``None``.
        """
        # NOTE(review): loadPage/getText are the legacy camelCase PyMuPDF
        # names; newer releases expose load_page/get_text - confirm against
        # the installed version before renaming.
        self.doc = fitz.Document(order)
        self.pages = tuple(
            self.doc.loadPage(p) for p in range(len(self.doc))
        )
        self.fpage = self.doc.loadPage(0)
        self.cnt = self.fpage.getText('text')

        # Items start right after the first "Delivery Date: ..." token, or
        # at the top of the page when there is none.
        first_date = re.search('Delivery Date: [0-9.0-9.0-9]+', self.cnt)
        self.items_start = first_date.end() if first_date else 0
        self.fitems_end = re.search('\nInvoice Address:', self.cnt)

        self.lpage = self.doc.loadPage(len(self.doc) - 1)
        self.cnt2 = self.lpage.getText('text')
        self.items_end = re.search('Goods Value', self.cnt2)

        # First page: slice between the two markers. Middle pages: whole
        # text. Last page: everything before "Goods Value". For a one-page
        # document only the last-page rule applies.
        chunks = []
        for number in range(len(self.doc) - 1):
            if number == 0:
                first_slice = self.cnt[
                    self.items_start:self.fitems_end.start() - 1
                ]
                chunks.append(first_slice + 2 * '\n')
            else:
                chunks.append('\n' + self.doc.loadPage(number).getText('text'))
        chunks.append(self.cnt2[:self.items_end.start()])
        items_area = ''.join(chunks)

        self.items_area = self.cleanup(items_area).split('\n')
        self.items_details = itemList(self.items_area, 9)

    def cleanup(self, text):
        """Strip repeated header fragments from ``text`` and return it.

        One pass is made per page of ``self.doc``. In every pass the first
        occurrence of each of the following is located and then all copies
        of that exact fragment are removed:

        * the span from ``Delivery Note:`` through the character after the
          ``Delivery Date: ...`` token,
        * each pattern in ``_OBSOLETE_PATTERNS``,
        * the span from the character before ``Brother Internationale``
          through ``Original``.

        Args:
            text: the raw concatenated page text.

        Returns:
            The text with the fragments removed.
        """
        for _ in range(len(self.doc)):
            note = re.search('Delivery Note:', text)
            date = re.search('Delivery Date: [0-9.0-9.0-9]+', text)
            page_head = re.search('Brother Internationale', text)
            page_tail = re.search('Original', text)

            # Every fragment is located on the text as it is at the start
            # of the pass; removal happens afterwards, in this order.
            fragments = []
            if note and date:
                fragments.append(text[note.start():date.end() + 1])
            for pattern in self._OBSOLETE_PATTERNS:
                found = re.search(pattern, text)
                if found:
                    fragments.append(found.group(0))
            if page_head and page_tail:
                # Clamp at 0: a header starting at offset 0 (possible here,
                # the last page text has no leading newline) would
                # otherwise give a slice start of -1 and an empty fragment.
                begin = max(page_head.start() - 1, 0)
                header = text[begin:page_tail.end()]
                if header:
                    fragments.append(header)

            for fragment in fragments:
                text = text.replace(fragment, '')
        return text
if __name__ == '__main__':
    # Manual smoke run: parse the sample order and dump the cleaned item
    # lines as one list.
    #
    # Other views that are handy while debugging:
    #   order.items_details.codes / .qnt / .net_val
    #   brotherInvoice('sample/3140149557.pdf').items_area
    order = brotherOrder(
        'pdf/ALTMP_P10_27.pdf'
    )
    print(
        order.items_area
    )
|