# -*- coding: utf-8 -*-
'''
Parsers for Brother invoice and order PDFs.

The text layer of a PDF is read with PyMuPDF (``fitz``), recurring page
blocks are stripped from it, and the remaining lines are sliced into
per-item columns by :class:`itemList`.

Created on Nov 18, 2019

@author: levente.marton
'''
import re

import fitz


# NOTE: '[0-9.0-9.0-9]+' is a single character class, i.e. "one or more
# digits or dots"; it is kept exactly as the original patterns had it.
_DELIVERY_NOTE_RE = re.compile(r'Delivery Note:')
_DELIVERY_DATE_RE = re.compile(r'Delivery Date: [0-9.0-9.0-9]+')
_PAGE_BLOCK_START_RE = re.compile(r'Brother Internationale')
_PAGE_BLOCK_END_RE = re.compile(r'Original')
_GOODS_VALUE_RE = re.compile(r'Goods Value')

# Fragments removed by both document types; the first match of each pattern
# is looked up on every cleanup pass.
_COMMON_NOISE = (
    re.compile(r'INCL.ALL PLATEN INSERTS'),
    re.compile(r'Customer Mat No.*\n \n'),
    re.compile(r'Cust\.Mat\.No.*\n \n \n'),
)
# Orders additionally drop the whole 'Cust.Mat.No.:' line.
_ORDER_NOISE = _COMMON_NOISE + (
    re.compile(r'Cust.Mat.No.:[ 0-9A-Za-z]*[\n]{1}'),
)


def _load_page(doc, number):
    '''Return page *number* of *doc*.

    Prefers the snake_case PyMuPDF method and falls back to the older
    camelCase alias, so both old and current PyMuPDF releases work.
    '''
    loader = getattr(doc, 'load_page', None) or doc.loadPage
    return loader(number)


def _page_text(page):
    '''Return the plain-text extraction of *page* (either PyMuPDF naming).'''
    getter = getattr(page, 'get_text', None) or page.getText
    return getter('text')


class itemList(object):
    '''
    Column view over the flat list of item lines of a document.

    The list is read in strides of ``jump`` lines.  Going by the attribute
    names, the four columns are presumably item code, quantity, net value
    and discount (not verifiable from this file alone):

    * ``codes``   -- offset 0; when the cell contains a space the next
      cell is used instead.
    * ``qnt``     -- offset 1; when the cell is longer than 3 characters
      the next cell is used instead.
    * ``net_val`` -- offset 2; when the cell has no comma the next cell is
      used instead; commas are turned into dots.
    * ``disc``    -- offset 3; ``'0'`` for an empty cell, otherwise the
      cell with ``'- %'`` removed and a leading ``'-'`` added.

    Whenever "the next cell" does not exist, the current cell is used.
    '''

    def __init__(self, item_list, jump):
        '''
        :param item_list: list of text lines; it is modified in place (the
            first empty string, if any, is removed).
        :param jump: number of lines that make up one item.
        '''
        self.item_list = item_list
        self.jump = jump
        # Only the first empty line is dropped; a list without one is
        # accepted as it is instead of raising ValueError.
        if '' in self.item_list:
            self.item_list.remove('')

        self.codes = self._column(0, lambda cell: ' ' in cell)
        self._cleanup(self.codes)

        self.qnt = self._column(1, lambda cell: len(cell) > 3)
        self._cleanup(self.qnt)

        self.net_val = [
            value.replace(',', '.')
            for value in self._column(2, lambda cell: ',' not in cell)
        ]
        self._cleanup(self.net_val)

        self.disc = []
        torem = '- %'
        for pos in range(3, len(self.item_list), self.jump):
            cell = self.item_list[pos].strip()
            if cell == '':
                self.disc.append('0')
            else:
                self.disc.append('-' + cell.replace(torem, ''))

    def _column(self, offset, use_next):
        '''Collect one stripped cell per item, starting at *offset*.

        *use_next* receives the stripped cell; when it returns true and a
        following cell exists, that following cell is taken instead.
        '''
        cells = self.item_list
        picked = []
        for pos in range(offset, len(cells), self.jump):
            cell = cells[pos].strip()
            if use_next(cell) and pos + 1 < len(cells):
                cell = cells[pos + 1].strip()
            picked.append(cell)
        return picked

    def _cleanup(self, _list):
        '''Drop a trailing empty string from *_list* (in place).'''
        if _list and _list[-1] == '':
            del _list[-1]

    def __getitem__(self, k):
        '''Return the instance attribute named *k* (KeyError if unknown).'''
        return self.__dict__[k]


class _BrotherDocument(object):
    '''
    Shared PDF reading and item extraction for Brother documents.

    Subclasses set the four class attributes below and call
    :meth:`_read` from their constructor.
    '''

    # Regex marking where the item lines stop on the first page.
    _FIRST_PAGE_END = None
    # Text put in front of the last page's text before it is searched.
    _LAST_PAGE_PREFIX = ''
    # Number of text lines per item, handed to itemList as ``jump``.
    _ITEM_STRIDE = None
    # Compiled patterns whose first match is deleted on each cleanup pass.
    _NOISE_PATTERNS = _COMMON_NOISE

    def _read(self, path):
        '''
        Open *path* and populate ``doc``, ``pages``, ``fpage``, ``cnt``,
        ``items_start``, ``fitems_end``, ``lpage``, ``cnt2``,
        ``items_end``, ``items_area`` and ``items_details``.

        :raises AttributeError: when 'Goods Value' is missing on the last
            page, or (documents with more than one page) when the
            first-page end marker is missing.
        '''
        self.doc = fitz.Document(path)
        page_count = len(self.doc)
        self.pages = tuple(
            _load_page(self.doc, number) for number in range(page_count)
        )

        self.fpage = _load_page(self.doc, 0)
        self.cnt = _page_text(self.fpage)
        delivery_date = _DELIVERY_DATE_RE.search(self.cnt)
        self.items_start = delivery_date.end() if delivery_date else 0
        self.fitems_end = re.search(self._FIRST_PAGE_END, self.cnt)

        self.lpage = _load_page(self.doc, page_count - 1)
        self.cnt2 = self._LAST_PAGE_PREFIX + _page_text(self.lpage)
        self.items_end = _GOODS_VALUE_RE.search(self.cnt2)

        # First page: from items_start up to the end marker.  Middle
        # pages: whole text.  Last page: everything before 'Goods Value'.
        # A one-page document therefore only contributes the last part.
        parts = []
        for number in range(page_count - 1):
            if number == 0:
                first_end = self.fitems_end.start() - 1
                parts.append(
                    self.cnt[self.items_start:first_end] + 2 * '\n'
                )
            else:
                parts.append(
                    '\n' + _page_text(_load_page(self.doc, number))
                )
        parts.append(self.cnt2[:self.items_end.start()])

        self.items_area = self.cleanup(''.join(parts)).split('\n')
        self.items_details = itemList(self.items_area, self._ITEM_STRIDE)

    def cleanup(self, text):
        '''
        Remove recurring non-item fragments from *text* and return it.

        One pass is made per page of ``self.doc``.  In every pass the
        fragments are located on the text as it is at the start of the
        pass and then every occurrence of each fragment is deleted, in
        this order:

        1. the span from 'Delivery Note:' to one character past the
           'Delivery Date: ...' match (only if both are found);
        2. the first match of each pattern in ``_NOISE_PATTERNS``;
        3. the span from one character before 'Brother Internationale' to
           the end of 'Original' (only if both are found).
        '''
        for _ in range(len(self.doc)):
            note = _DELIVERY_NOTE_RE.search(text)
            date = _DELIVERY_DATE_RE.search(text)
            block_start = _PAGE_BLOCK_START_RE.search(text)
            block_end = _PAGE_BLOCK_END_RE.search(text)

            fragments = []
            if note and date:
                fragments.append(text[note.start():date.end() + 1])
            for pattern in self._NOISE_PATTERNS:
                found = pattern.search(text)
                if found:
                    fragments.append(found.group(0))
            if block_start and block_end:
                # Clamp at 0: a marker at the very start of the text would
                # otherwise give index -1, which slices from the end of
                # the string and leaves the block in place.
                begin = max(block_start.start() - 1, 0)
                fragments.append(text[begin:block_end.end()])

            for fragment in fragments:
                if fragment:
                    text = text.replace(fragment, '')
        return text


class brotherInvoice(_BrotherDocument):
    '''
    A Brother invoice PDF.

    Besides the attributes set by the shared reader, the constructor sets
    ``size`` (number of item codes), ``data`` (the digits-and-dots token
    following 'Invoice No.' -- presumably the invoice date, TODO confirm),
    ``inv_nr`` (the digits on the line after ``data``) and ``vat_num``
    (the token after 'Umsatzsteuer ID Nr. ').
    '''

    _FIRST_PAGE_END = r' \nInvoice Address:'
    _LAST_PAGE_PREFIX = '\n'
    _ITEM_STRIDE = 10
    _NOISE_PATTERNS = _COMMON_NOISE

    def __init__(self, invoice):
        '''
        :param invoice: PDF to open, passed to ``fitz.Document``.
        :raises AttributeError: when an expected marker is not found in
            the document text.
        '''
        self._read(invoice)
        self.size = len(self.items_details.codes)

        found = re.search(r'(?<=Invoice No. \n)[0-9.0-9.0-9]+', self.cnt)
        if found is None:
            # Fallback layout: the token follows a nine-digit line.
            found = re.search(r'(?<=[0-9]{9}\n)[0-9.0-9.0-9]+', self.cnt)
        self.data = found.group(0)

        # ``data`` contains dots; escape it so they only match literally.
        inv_nr_pattern = r'(?<={}\n)[0-9]+'.format(re.escape(self.data))
        self.inv_nr = re.search(inv_nr_pattern, self.cnt).group(0)
        self.vat_num = re.search(
            r'(?<=Umsatzsteuer ID Nr. )[A-Z0-9]+', self.cnt
        ).group(0)

    def __getitem__(self, k):
        '''Return the instance attribute named *k* (KeyError if unknown).'''
        return self.__dict__[k]

    def __len__(self):
        '''Return the number of item codes found in the invoice.'''
        return self.size


class brotherOrder(_BrotherDocument):
    '''
    A Brother order PDF.

    Read like an invoice, but with 9 lines per item, no prefix on the
    last page's text, and the 'Cust.Mat.No.:' line removed during
    cleanup.
    '''

    _FIRST_PAGE_END = r'\nInvoice Address:'
    _LAST_PAGE_PREFIX = ''
    _ITEM_STRIDE = 9
    _NOISE_PATTERNS = _ORDER_NOISE

    def __init__(self, order):
        '''
        :param order: PDF to open, passed to ``fitz.Document``.
        :raises AttributeError: when an expected marker is not found in
            the document text.
        '''
        self._read(order)


if __name__ == '__main__':
    # Manual smoke run against a local sample file.
    order = brotherOrder('pdf/ALTMP_P10_27.pdf')
    print(order.items_area)