"""Download, parse, rename and catalogue receipt PDFs.

The module fetches receipt PDFs whose upload indexes are listed in a Google
Sheet, extracts a handful of fields from the first page of each PDF, can
rename the files after those fields, writes an ``.xlsx`` summary and can
write extracted values back into the sheet.
"""
import os
import re
from datetime import date, datetime, timedelta

import gspread
import pypdf
import requests
import urllib3
import xlsxwriter
from google.oauth2.service_account import Credentials
from gspread import BackOffHTTPClient
from pypdf import PageObject as my_page  # @UnusedImport
from pypdf.errors import PdfReadError
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.exceptions import InsecureRequestWarning
from urllib3.util.retry import Retry

# Patterns applied to the text of the first PDF page.  They are raw strings so
# that ``\?`` and ``\n`` reach the regex engine without relying on Python's
# handling of unknown string escapes.
_CIF_RE = re.compile(r'(?<=CIF: )[0-9]+')
_INDEX_RE = re.compile(
    r'(?<=Index încărcare: )[0-9]+|(?<=Index înc\?rcare: )[0-9]+')
_FILING_DATE_RE = re.compile(r'(?<=din )[0-9.]+')
_REG_DATE_RE = re.compile(
    r'(?<=din data de\n)[0-9.]+|(?<=din data de)[0-9.]+'
    r'|(?<=din data de )[0-9.]+|(?<=data de )[0-9.]+')
_TYPE_RE = re.compile(r'(?<=tip )[A-Za-z0-9]+')
_REPORT_MONTH_RE = re.compile(r'(?<=raportare )[0-9]+')
_RECTIFICATION_RE = re.compile(r'[A-Za-z]*ectific[a-z]*')

# Column titles used by ``Receipts.xling`` when the caller supplies none.
_DEFAULT_HEADERS = ('codfiscal', 'index', 'data_dep', 'data_inreg', 'tip',
                    'luna', 'anul', 'mesaj', 'id', 'rect')


class ReceiptParseError(AttributeError):
    '''Raised when an expected field is missing from a receipt's text.

    It derives from ``AttributeError`` because a missing field used to
    surface as ``'NoneType' object has no attribute 'group'``; existing
    ``except AttributeError`` handlers therefore keep working.
    '''


def _required_match(pattern, text, label):
    '''Return the first match of *pattern* in *text* or raise a clear error.'''
    match = pattern.search(text)
    if match is None:
        raise ReceiptParseError('could not find %s in receipt text' % label)
    return match


def printProgressBar(iteration, total, prefix='', suffix='', decimals=1,
                     length=100, fill='#'):
    '''
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int), must be non-zero
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    '''
    percent = '{0:.{1}f}'.format(100 * (iteration / float(total)), decimals)
    filled_length = int(length * iteration // total)
    bar = fill * filled_length + '-' * (length - filled_length)
    # The carriage returns keep the bar on one terminal line between calls.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    # Print New Line on Complete
    if iteration == total:
        print()


class Receipts(object):
    '''Downloads receipt PDFs and extracts / exports the data they contain.

    :param cookie: optional cookies passed to ``requests`` when downloading
    '''

    def __init__(self, cookie=None):
        self.cookie = cookie

    @staticmethod
    def filter_files(_ext, path=None):
        '''Lazily yield the names in *path* that end with *_ext*.

        :param _ext: suffix to match, e.g. ``'.pdf'``
        :param path: directory to list; the current directory when falsy
        :return: an iterator of bare file names (no directory part)
        '''
        names = os.listdir(path) if path else os.listdir()
        return filter(lambda name: name.endswith(_ext), names)

    def pdf_renamer(self, iter_list, *args):
        '''Append ``_<args joined by '_'>`` to a file name, before its extension.

        :param iter_list: path of the single file to rename (a string, despite
            the name)
        :param args: string fragments, normally values read from the PDF
        '''
        self.iter_list = iter_list
        # Collapse any whitespace inside the fragments so the resulting file
        # name never contains spaces or line breaks.
        suffix = '_'.join('_'.join(args).split())
        if suffix in iter_list:
            print(iter_list + ' was already renamed', end='\r', flush=True)
            return
        print('renaming adding ' + suffix + iter_list)
        stem, extension = os.path.splitext(iter_list)
        os.rename(iter_list, '%s_%s%s' % (stem, suffix, extension))

    def pdf_reader(self, my_file):
        '''Extract the receipt fields from the first page of a PDF.

        The values are stored on the instance (``cod_fiscal``,
        ``index_incarcare``, ``mesaj``, ``type_``, ``month_``, ``year_``,
        ``reg_date``, ``rect``, ``content``, ``data``) and also returned.

        :param my_file: path of the PDF to read
        :return: ``{cod_fiscal: [index, filing date, registration date, type,
            month, year, message, cod_fiscal + type, rect]}`` where ``rect``
            is the string ``'true'`` or ``'false'``
        :raises PdfReadError: when pypdf cannot read the file
        :raises ReceiptParseError: when a mandatory field is not found
        '''
        self.my_file = my_file
        try:
            reader = pypdf.PdfReader(my_file)
        except PdfReadError as err:
            raise PdfReadError('receipts still processing on server') from err
        my_content = reader.pages[0].extract_text()

        cod_fiscal_re = _required_match(_CIF_RE, my_content, 'CIF')
        cod_fiscal = cod_fiscal_re.group()
        index_incarcare = _required_match(
            _INDEX_RE, my_content, 'upload index').group()
        data_depunere = _required_match(
            _FILING_DATE_RE, my_content, 'filing date').group()
        data_reg = _required_match(
            _REG_DATE_RE, my_content, 'registration date').group()
        tip = _required_match(_TYPE_RE, my_content, 'declaration type').group()

        luna_re = _REPORT_MONTH_RE.search(my_content)
        if luna_re:
            luna = luna_re.group()
            # The year is taken as the four characters that follow the month
            # and one separator character.
            anul = my_content[luna_re.end() + 1:luna_re.end() + 5]
        else:
            # No reporting period in the text: fall back to the registration
            # date, sliced as if formatted DD.MM.YYYY.
            luna = data_reg[3:5]
            anul = data_reg[-4:]

        # Up to 1000 characters after the CIF (skipping one separator).
        mesaj = my_content[cod_fiscal_re.end() + 1:cod_fiscal_re.end() + 1001]
        rect1 = 'true' if _RECTIFICATION_RE.search(my_content) else 'false'

        self.content = my_content
        self.rect = rect1
        self.cod_fiscal = cod_fiscal
        self.index_incarcare = index_incarcare
        self.mesaj = mesaj.strip()
        self.type_ = tip
        self.month_ = luna
        self.year_ = anul
        self.reg_date = data_reg
        my_dict = {cod_fiscal: [index_incarcare, data_depunere, data_reg, tip,
                                luna, anul, mesaj[:-1], cod_fiscal + tip,
                                rect1]}
        self.data = my_dict
        return my_dict

    def get_rect(self):
        '''return the rectification flag ('true' / 'false') of the last read receipt'''
        return self.rect

    def get_cod_fiscal(self):
        '''return cod fiscal (CIF) of the last read receipt'''
        return self.cod_fiscal

    def get_index(self):
        '''return upload index of the last read receipt'''
        return self.index_incarcare

    def get_message(self):
        '''return receipt message'''
        return self.mesaj

    def get_type(self):
        '''return type of the declaration'''
        return self.type_

    def get_month(self):
        '''return the month of the last read receipt'''
        return self.month_

    def get_year(self):
        '''return the year of the last read receipt'''
        return self.year_

    def get_reg_date(self):
        '''return the registration date of the last read receipt'''
        return self.reg_date

    def check_name(self, my_iter):
        '''Map renamed files back to the names they were downloaded under.

        For every name containing ``'_'`` the part before the first
        underscore plus ``'.pdf'`` is returned; names without an underscore
        are skipped.

        :param my_iter: iterable of file names
        :return: list of reconstructed download names
        '''
        self.my_iter = my_iter
        return [name.partition('_')[0] + '.pdf'
                for name in my_iter if '_' in name]

    def get_url(self, rindex, ext_,
                my_url='https://www.anaf.ro/StareD112/ObtineRecipisa?numefisier='):
        '''Download one receipt unless it is already in the current directory.

        Alternative url if original not working:
        https://epatrim.anaf.ro/StareD112/ObtineRecipisa?numefisier=

        :param rindex: upload index (string) taken from the google sheet
        :param ext_: extension used to list the files already present
        :param my_url: url prefix; ``rindex + '.pdf'`` is appended to it
        :return: number of bytes written, or ``None`` when the file was
            already present or the server returned an empty body
        '''
        # NOTE(review): certificate verification is switched off below
        # (verify=False) and the matching warning is silenced - confirm this
        # is still required for the target server.
        urllib3.disable_warnings(InsecureRequestWarning)
        self.my_url = my_url
        self.rindex = rindex
        file_name = rindex + '.pdf'

        # Earlier versions created an empty '__cache__/<index>.pdf' file on
        # every call; that side effect is kept, but the handle is now closed.
        # Nothing in this module reads the placeholder.
        cache_dir = os.path.join(os.getcwd(), '__cache__')
        os.makedirs(cache_dir, exist_ok=True)
        with open(os.path.join(cache_dir, file_name), 'wb'):
            pass

        # Present either under its download name or under a renamed name.
        if (file_name in self.filter_files(ext_)
                or file_name in self.check_name(self.filter_files(ext_))):
            print(file_name, ' already saved', end='\r', flush=True)
            return None

        with requests.Session() as session:
            adapter = HTTPAdapter(
                max_retries=Retry(connect=3, backoff_factor=1))
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(my_url + file_name, cookies=self.cookie,
                                   verify=False, stream=True)
            payload = response.content

        if not payload:
            return None
        with open(file_name, 'wb') as out_file:
            print('writing', file_name)
            return out_file.write(payload)

    def _download_period(self, ext_, cond1, cond2, period):
        '''Download every non-empty index whose period cell equals *period*.'''
        for index, row_period in self.cond_range(cond1, cond2):
            if index != '' and row_period == period:
                self.get_url(index, ext_)

    def r_downloader(self, ext_, cond1=4, cond2=6):
        '''Interactively download the receipts listed in the google sheet.

        Pressing enter downloads the previous month's receipts, but only when
        the current directory is named after that month (see ``_dir_fdate``);
        entering ``s`` asks for a ``Mmm-YYYY`` period and downloads that.

        :param ext_: extension of the files to look for locally
        :param cond1: sheet column holding the upload indexes
        :param cond2: sheet column holding the ``Mmm-YYYY`` period
        '''
        my_path = [os.path.basename(os.getcwd())]
        period_date = self._dir_fdate()
        print(my_path, 'VS', period_date)
        sel = input('enter to continue, s to select month')

        today = date.today()
        last_of_prev_month = today - timedelta(days=today.day)
        mth_yr = '%s-%s' % (last_of_prev_month.strftime('%b'),
                            last_of_prev_month.year)
        print('mth_yr', mth_yr)

        if sel == '':
            if my_path == period_date:
                self._download_period(ext_, cond1, cond2, mth_yr)
            else:
                print('current folder does not match %s, nothing downloaded'
                      % period_date)
        elif sel == 's':
            chosen = input('Mmm-YYY')
            self._download_period(ext_, cond1, cond2, chosen)

    def _dir_fdate(self):
        '''Return ``['YYYY MM']`` for the month preceding today.

        The single-element list is also stored in ``self.date_l``.
        '''
        today = datetime.today()
        last_of_prev_month = today - timedelta(days=today.day)
        self.date_l = ['%s %02d' % (last_of_prev_month.year,
                                    last_of_prev_month.month)]
        return self.date_l

    def get_gspread(self, json_name, g_sheet_name, sheet=''):
        '''Open a google sheet and keep the worksheet in ``self.sheet``.

        :param json_name: service-account json file used for login
        :param g_sheet_name: the workbook name
        :param sheet: title of the worksheet where the indexes are; the first
            worksheet is used when empty
        '''
        self.json_name = json_name
        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        creds = Credentials.from_service_account_file(json_name, scopes=scope)
        client = gspread.authorize(creds, http_client=BackOffHTTPClient)
        workbook = client.open(g_sheet_name)
        self.sheet = workbook.worksheet(sheet) if sheet else workbook.sheet1

    def cond_range(self, col1=1, col2=2):
        '''Pair up the values of two sheet columns, skipping the first 3 rows.

        :param col1: column number of the first value
        :param col2: column number of the second value
        :return: ``zip`` of ``(col1 value, col2 value)``
        '''
        self.col1 = col1
        first_values = self.sheet.col_values(col1)[3:]
        self.col2 = col2
        second_values = self.sheet.col_values(col2)[3:]
        return zip(first_values, second_values)

    def xling(self, xl_param, wsheet_param, ext_, renamer='n', path='',
              headers=None):
        '''Write one row per receipt found in *path* to ``<xl_param>.xlsx``.

        :param xl_param: workbook name (without extension)
        :param wsheet_param: worksheet name
        :param ext_: extension of the receipt files
        :param renamer: ``'y'`` to also rename each file after its content
        :param path: directory holding the receipts, current directory if
            empty, e.g. ``c:\\Users\\<user>\\git\\Hello-World\\omv\\Mzk\\``
        :param headers: column titles; a built-in set is used when ``None``
        :return: number of data rows written
        '''
        if headers is None:
            headers = _DEFAULT_HEADERS
        output_data = xlsxwriter.Workbook(xl_param + '.xlsx')
        w_sheet = output_data.add_worksheet(wsheet_param)

        print('initiating headers: ', ', '.join(headers))
        print('')
        head_format = output_data.add_format({'bold': True})
        for hcol, header in enumerate(headers):
            w_sheet.write(0, hcol, header, head_format)
            print('writing header in column', hcol + 1, 'name', header,
                  end='\r', flush=True)

        my_row = 0
        for file_name in self.filter_files(ext_, path=path):
            file_path = os.path.join(path, file_name)
            for cod_fiscal, values in self.pdf_reader(file_path).items():
                my_row += 1
                print('writing row %s' % my_row, end='\r', flush=True)
                w_sheet.write(my_row, 0, cod_fiscal)
                w_sheet.write_row(my_row, 1, values)
                if renamer == 'y':
                    # values[3:6] are type, month and year.
                    self.pdf_renamer(file_path, cod_fiscal, values[3],
                                     values[4], values[5])
        print('%s.xlsx written & ready' % xl_param)

        duplicate_format = output_data.add_format(
            {'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
        w_sheet.autofilter(0, 0, 100, 9)
        w_sheet.set_column(7, 7, 50)
        w_sheet.set_column(8, 8, 15)
        w_sheet.set_column(0, 6, 11)
        w_sheet.freeze_panes(1, 0)
        # Highlight repeated values in column I.
        w_sheet.conditional_format(
            'I2:I500', {'type': 'duplicate', 'format': duplicate_format})
        output_data.close()
        return my_row

    def _files_by_index(self, ext_):
        '''Parse every matching file once; map upload index -> file names.'''
        mapping = {}
        for file_name in self.filter_files(ext_):
            self.pdf_reader(file_name)
            mapping.setdefault(self.get_index(), []).append(file_name)
        return mapping

    def save_message(self, ext_, col1, col2, offset, *args):
        '''Write values read from the receipts back into the google sheet.

        Rows whose *col2* cell is ``'FALSE'`` are matched, through the index
        in *col1*, to local receipt files.  Each match is printed first, then
        for every callable in *args* its result is written to column
        ``col2 - offset[position of the callable]`` of the matching row.

        :param ext_: extension of the receipt files
        :param col1: sheet column holding the upload indexes
        :param col2: sheet column holding the ``'FALSE'`` markers
        :param offset: one column offset per callable in *args*
        :param args: zero-argument callables, typically this instance's
            getters; each is called right after the matching file was read
        '''
        files_by_index = None

        def matching_files(index):
            # Built on first use so that nothing is parsed when no row is
            # pending.
            nonlocal files_by_index
            if files_by_index is None:
                files_by_index = self._files_by_index(ext_)
            return files_by_index.get(index, ())

        for ind, mess in tqdm(self.cond_range(col1, col2)):
            if mess != 'FALSE':
                continue
            for m_file in matching_files(ind):
                self.pdf_reader(m_file)
                print('{0} - {1}-{2}-{3}'.format(
                    self.get_cod_fiscal(), self.get_index(),
                    self.get_type(), self.get_message()))

        for m_ind, getter in enumerate(tqdm(args)):
            for ind, mess in self.cond_range(col1, col2):
                if mess != 'FALSE':
                    continue
                for m_file in matching_files(ind):
                    # Re-read so the getter reports this receipt's values.
                    self.pdf_reader(m_file)
                    cell = self.sheet.find(self.get_index())
                    self.sheet.update_cell(
                        cell.row, col2 - offset[m_ind], getter())


if __name__ == '__main__':
    my_recipisa = Receipts()
    p = my_recipisa.pdf_reader('241707933_17259191_D300_1_2021.pdf')
    print(my_recipisa.year_)
    # Usage examples:
    # my_recipisa.get_gspread('Pysheet26134-2daf66659e50.json', 'Recipisa Mozaik')
    # my_recipisa.r_downloader('.pdf', cond1=3, cond2=5)
    # my_recipisa.xling('recipise', 'recipise', '.pdf', 'y')
    # my_recipisa.save_message('.pdf', 3, 10, [1, 6, 8], my_recipisa.get_message, my_recipisa.get_type, my_recipisa.get_cod_fiscal)