123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349 |
- import os
- import re
- import urllib3
- from datetime import date, datetime, timedelta
- from urllib3.exceptions import InsecureRequestWarning
- from urllib3.util.retry import Retry
- # from oauth2client.service_account import ServiceAccountCredentials
- import pypdf
- import gspread
- import requests
- import xlsxwriter
- from gspread import BackOffHTTPClient
- from google.oauth2.service_account import Credentials
- from requests.adapters import HTTPAdapter
- from pypdf.errors import PdfReadError
- from pypdf import PageObject as my_page # @UnusedImport
- from tqdm import tqdm
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='#'):
    '''
    Draw (or redraw) a one-line terminal progress bar; call it inside a loop.

    The line is written with a leading and trailing carriage return so that
    the next call overwrites it. A newline is emitted once
    ``iteration == total``.

    @params:
    iteration - Required : current iteration (Int)
    total - Required : total iterations (Int); 0 is drawn as a finished bar
    prefix - Optional : prefix string (Str)
    suffix - Optional : suffix string (Str)
    decimals - Optional : positive number of decimals in percent complete (Int)
    length - Optional : character length of bar (Int)
    fill - Optional : bar fill character (Str)
    '''
    if total:
        ratio = iteration / float(total)
        filled = int(length * iteration // total)
    else:
        # Nothing to do means nothing left to do: show a completed bar instead
        # of dividing by zero.
        ratio = 1.0
        filled = length
    percent = '{0:.{1}f}'.format(100 * ratio, decimals)
    bar = fill * filled + '-' * (length - filled)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    # Move to a fresh line once the job is complete.
    if iteration == total:
        print()
class Receipts(object):
    '''Helper for working with PDF submission receipts ("recipisa").

    The methods download receipt PDFs by upload index, extract their fields
    with regular expressions, rename the files, export a summary workbook and
    write selected fields back to a Google sheet.

    :param cookie: optional value passed as ``cookies=`` on download requests
    '''

    # Column titles written by xling() when the caller supplies none.
    _DEFAULT_HEADERS = ('codfiscal', 'index', 'data_dep', 'data_inreg', 'tip',
                        'luna', 'anul', 'mesaj', 'id', 'rect')

    # Field patterns, compiled once. The second alternative of _INDEX_RE
    # matches a literal '?' where the text extraction lost the 'ă'.
    _CIF_RE = re.compile('(?<=CIF: )[0-9]+')
    _INDEX_RE = re.compile(r'(?<=Index încărcare: )[0-9]+|(?<=Index înc\?rcare: )[0-9]+')
    _SUBMITTED_RE = re.compile('(?<=din )[0-9.]+')
    _REGISTERED_RE = re.compile(
        '(?<=din data de\n)[0-9.]+|(?<=din data de)[0-9.]+|(?<=din data de )[0-9.]+')
    _TYPE_RE = re.compile('(?<=tip )[A-Za-z0-9]+')
    _PERIOD_RE = re.compile('(?<=raportare )[0-9]+')
    _RECTIFIED_RE = re.compile('[A-Za-z]*ectific[a-z]*')

    def __init__(self, cookie=None):
        self.cookie = cookie

    @staticmethod
    def filter_files(_ext, path=None):
        '''Return a lazy iterator over the entry names that end with ``_ext``.

        :param _ext: suffix to match, e.g. ``'.pdf'``
        :param path: directory to list; any falsy value means the current
            working directory
        :return: ``filter`` iterator of bare names (no directory part)
        '''
        return filter(lambda name: name.endswith(_ext), os.listdir(path or '.'))

    def pdf_renamer(self, iter_list, *args):
        '''Append ``_<arg1>_<arg2>...`` to a file name, before its extension.

        :param iter_list: path of the file to rename (a single string)
        :param args: strings to append; whitespace inside them becomes ``_``

        The file is left alone when the suffix already occurs anywhere in
        ``iter_list`` or when there is nothing to append.
        '''
        self.iter_list = iter_list
        # Join the parts, then collapse any whitespace so the result is a
        # single token that is safe to use in a file name.
        suffix = '_'.join('_'.join(args).split())
        if not suffix or suffix in iter_list:
            print(iter_list + ' was already renamed', end='\r', flush=True)
            return
        print('renaming adding ' + suffix + iter_list)
        stem, extension = os.path.splitext(iter_list)
        os.rename(iter_list, '%s_%s%s' % (stem, suffix, extension))

    def pdf_reader(self, my_file):
        '''Read the first page of a receipt PDF and extract its fields.

        :param my_file: path (or stream) handed to ``pypdf.PdfReader``
        :return: ``{cod_fiscal: [index, data_depunere, data_reg, tip, luna,
            anul, mesaj, cod_fiscal + tip, rect]}``
        :raises PdfReadError: when pypdf cannot read the file
        :raises AttributeError: when a required field pattern is not found in
            the extracted text (the regex match is ``None``)

        The extracted values are also stored on the instance and exposed by
        the ``get_*`` methods.
        '''
        self.my_file = my_file
        try:
            reader = pypdf.PdfReader(my_file)
        except PdfReadError as err:
            # Keep the original cause attached for debugging.
            raise PdfReadError('receipts still processing on server') from err
        return self._parse_content(reader.pages[0].extract_text())

    def _parse_content(self, my_content):
        '''Extract the receipt fields from page text and store them on self.'''
        cif_match = self._CIF_RE.search(my_content)
        cod_fiscal = cif_match.group()
        index_incarcare = self._INDEX_RE.search(my_content).group()
        data_depunere = self._SUBMITTED_RE.search(my_content).group()
        data_reg = self._REGISTERED_RE.search(my_content).group()
        tip = self._TYPE_RE.search(my_content).group()

        period_match = self._PERIOD_RE.search(my_content)
        if period_match:
            luna = period_match.group()
            # The four characters after the one-character separator that
            # follows the month.
            anul = my_content[period_match.end() + 1:period_match.end() + 5]
        else:
            # No reporting period in the text: fall back to the registration
            # date, read as dd.mm.yyyy.
            luna = data_reg[3:5]
            anul = data_reg[-4:]

        # Up to 1000 characters following the CIF, skipping one separator.
        mesaj = my_content[cif_match.end() + 1:cif_match.end() + 1001]
        rect1 = 'true' if self._RECTIFIED_RE.search(my_content) else 'false'

        self.content = my_content
        self.rect = rect1
        self.cod_fiscal = cod_fiscal
        self.index_incarcare = index_incarcare
        self.mesaj = mesaj.strip()
        self.type_ = tip
        self.month_ = luna
        self.year_ = anul
        self.reg_date = data_reg
        # NOTE(review): the exported message drops its last character while
        # get_message() returns the stripped text; kept as-is, confirm intent.
        self.data = {cod_fiscal: [index_incarcare, data_depunere, data_reg, tip, luna, anul,
                                  mesaj[:-1], cod_fiscal + tip, rect1]}
        return self.data

    def get_rect(self):
        '''return 'true' / 'false': whether the text mentions a rectification'''
        return self.rect

    def get_cod_fiscal(self):
        '''return cod fiscal'''
        return self.cod_fiscal

    def get_index(self):
        '''return index de incarcare (upload index)'''
        return self.index_incarcare

    def get_message(self):
        '''return receipt message, stripped of surrounding whitespace'''
        return self.mesaj

    def get_type(self):
        '''return type of the declaration'''
        return self.type_

    def get_month(self):
        '''return reporting month as extracted from the receipt'''
        return self.month_

    def get_year(self):
        '''return reporting year as extracted from the receipt'''
        return self.year_

    def get_reg_date(self):
        '''return registration date as extracted from the receipt'''
        return self.reg_date

    def check_name(self, my_iter):
        '''Map renamed files back to the name they were downloaded under.

        :param my_iter: iterable of file names
        :return: list with, for every name containing ``_``, the part before
            the first ``_`` followed by ``.pdf``
        '''
        self.my_iter = my_iter
        return [name[:name.index('_')] + '.pdf' for name in my_iter if '_' in name]

    def get_url(self, rindex, ext_, my_url='https://www.anaf.ro/StareD112/ObtineRecipisa?numefisier='):
        '''Download the receipt for one upload index into the working directory.

        :param rindex: upload index (string), e.g. taken from the Google sheet
        :param ext_: extension used to list files that are already present
        :param my_url: endpoint prefix; the file name is appended to it
        :return: number of bytes written, or ``None`` when the file was
            already present or the response body was empty

        Alternative url if original not working:
        https://epatrim.anaf.ro/StareD112/ObtineRecipisa?numefisier=
        '''
        urllib3.disable_warnings(InsecureRequestWarning)
        self.my_url = my_url
        self.rindex = rindex
        file_name = rindex + '.pdf'

        # NOTE(review): an empty placeholder under __cache__ is still created
        # so the on-disk side effects match the previous implementation;
        # nothing in this file reads it.
        cache_dir = os.path.join(os.getcwd(), '__cache__')
        os.makedirs(cache_dir, exist_ok=True)
        with open(os.path.join(cache_dir, file_name), 'wb'):
            pass

        # Skip files that exist under their original or their renamed name.
        if (file_name in self.filter_files(ext_)
                or file_name in self.check_name(self.filter_files(ext_))):
            print(file_name, ' already saved', end='\r', flush=True)
            return None

        # SECURITY: certificate verification is disabled (and its warning
        # silenced above). Kept to preserve existing behaviour.
        # NOTE(review): confirm whether verify=False is still required.
        request_kwargs = {'verify': False, 'stream': True}
        if self.cookie is not None:
            request_kwargs['cookies'] = self.cookie

        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=1))
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            payload = session.get(my_url + file_name, **request_kwargs).content

        if not payload:
            return None
        with open(file_name, 'wb') as target:
            print('writing', file_name)
            return target.write(payload)

    def r_downloader(self, ext_, cond1=4, cond2=6):
        '''Interactively download the receipts listed in the Google sheet.

        :param ext_: file extension passed on to :meth:`get_url`
        :param cond1: sheet column holding the upload indexes
        :param cond2: sheet column holding the ``Mmm-YYYY`` period label

        Pressing enter downloads the previous month's receipts, but only when
        the working directory is named after that period (``YYYY MM``).
        Entering ``s`` asks for a period label instead. Any other answer does
        nothing.
        '''
        my_path = [os.path.basename(os.getcwd())]
        period_date = self._dir_fdate()
        print(my_path, 'VS', period_date)
        today = date.today()
        # Last day of the previous month; it carries the right month and year.
        previous = today - timedelta(days=today.day)
        sel = input('enter to continue, s to select month')
        mth_yr = '%s-%s' % (previous.strftime('%b'), previous.year)
        print('mth_yr', mth_yr)
        if sel == '':
            if my_path != period_date:
                return
            wanted = mth_yr
        elif sel == 's':
            wanted = input('Mmm-YYY')
        else:
            return
        for index, period in self.cond_range(cond1, cond2):
            if index != '' and period == wanted:
                self.get_url(index, ext_)

    def _dir_fdate(self):
        '''Return ``['YYYY MM']`` for the previous calendar month.

        The list is also stored as ``self.date_l``.
        '''
        today = date.today()
        previous = today - timedelta(days=today.day)
        self.date_l = ['%s %02d' % (previous.year, previous.month)]
        return self.date_l

    def get_gspread(self, json_name, g_sheet_name, sheet=''):
        '''Open a Google sheet and keep the worksheet in ``self.sheet``.

        :param json_name: service-account json file used for login
        :param g_sheet_name: the workbook name
        :param sheet: title of the worksheet holding the indexes; when empty
            the first worksheet is used
        '''
        self.json_name = json_name
        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        creds = Credentials.from_service_account_file(json_name, scopes=scope)
        gsheet = gspread.authorize(creds, http_client=BackOffHTTPClient)
        workbook = gsheet.open(g_sheet_name)
        self.sheet = workbook.worksheet(sheet) if sheet else workbook.sheet1

    def cond_range(self, col1=1, col2=2):
        '''Pair up the values of two sheet columns, skipping the first 3 rows.

        :param col1: first column number
        :param col2: second column number
        :return: ``zip`` iterator of ``(col1_value, col2_value)`` tuples
        '''
        self.col1 = col1
        self.col2 = col2
        first = self.sheet.col_values(col1)[3:]
        second = self.sheet.col_values(col2)[3:]
        # NOTE(review): zip stops at the shorter column; confirm both columns
        # are always filled down to the same row.
        return zip(first, second)

    def xling(self, xl_param, wsheet_param, ext_, renamer='n', path='', headers=None):
        '''Export the fields of every matching PDF to an .xlsx workbook.

        :param xl_param: workbook name, without the ``.xlsx`` extension
        :param wsheet_param: worksheet name
        :param ext_: extension of the files to read
        :param renamer: ``'y'`` to also rename each file via :meth:`pdf_renamer`
        :param path: directory holding the files; empty means the working
            directory
        :param headers: column titles; defaults to the built-in ten titles
        :return: number of data rows written
        '''
        if headers is None:
            headers = self._DEFAULT_HEADERS
        output_data = xlsxwriter.Workbook(xl_param + '.xlsx')
        w_sheet = output_data.add_worksheet(wsheet_param)

        print('initiating headers: ', ', '.join(headers))
        print('')
        headformat = output_data.add_format({'bold': True})
        for hcol, title in enumerate(headers):
            w_sheet.write(0, hcol, title, headformat)
            print('writing header in column', hcol + 1, 'name', title, end='\r', flush=True)

        my_row = 0
        for file_name in self.filter_files(ext_, path=path):
            # os.path.join copes with a directory given without a trailing
            # separator.
            pdf_path = os.path.join(path or '', file_name)
            for cod_fiscal, fields in self.pdf_reader(pdf_path).items():
                my_row += 1
                print('writing row %s' % my_row, end='\r', flush=True)
                w_sheet.write(my_row, 0, cod_fiscal)
                w_sheet.write_row(my_row, 1, fields)
                if renamer == 'y':
                    # fields[3:6] are type, month and year.
                    self.pdf_renamer(pdf_path, cod_fiscal, fields[3], fields[4], fields[5])
        print('%s.xlsx written & ready' % xl_param)

        format1 = output_data.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
        # At least the former fixed ranges (100 rows / I2:I500), extended when
        # more rows were written.
        w_sheet.autofilter(0, 0, max(my_row, 100), 9)
        w_sheet.set_column(7, 7, 50)
        w_sheet.set_column(8, 8, 15)
        w_sheet.set_column(0, 6, 11)
        w_sheet.freeze_panes(1, 0)
        w_sheet.conditional_format(1, 8, max(my_row, 499), 8,
                                   {'type': 'duplicate', 'format': format1})
        output_data.close()
        return my_row

    def save_message(self, ext_, col1, col2, offset, *args):
        '''Write receipt fields to the Google sheet rows that still need them.

        :param ext_: extension of the receipt files in the working directory
        :param col1: sheet column holding the upload indexes
        :param col2: sheet column holding the flag; rows whose value is
            ``'FALSE'`` are processed
        :param offset: indexable; ``offset[n]`` is subtracted from ``col2`` to
            get the target column of the n-th getter
        :param args: zero-argument callables (e.g. ``self.get_message``),
            called right after the matching PDF has been parsed

        Every PDF is parsed once. All values are collected before the first
        cell is written.
        '''
        pending = {index for index, flag in self.cond_range(col1, col2) if flag == 'FALSE'}
        if not pending:
            return

        updates = []
        for file_name in tqdm(list(self.filter_files(ext_))):
            self.pdf_reader(file_name)
            index = self.get_index()
            if index not in pending:
                continue
            print('{0} - {1}-{2}-{3}'.format(self.get_cod_fiscal(), self.get_index(),
                                             self.get_type(), self.get_message()))
            # The getters read the state pdf_reader just stored, so evaluate
            # them now, before the next file overwrites it.
            updates.append((index, [getter() for getter in args]))

        for index, values in tqdm(updates):
            if not values:
                continue
            cell = self.sheet.find(index)
            for position, value in enumerate(values):
                self.sheet.update_cell(cell.row, col2 - offset[position], value)
if __name__ == '__main__':
    # Smoke run: parse one sample receipt from the working directory and show
    # the reporting year extracted from it.
    receipts = Receipts()
    receipts.pdf_reader('241707933_17259191_D300_1_2021.pdf')
    print(receipts.get_year())
    # Other entry points, kept as usage examples:
    # receipts.get_gspread('Pysheet26134-2daf66659e50.json', 'Recipisa Mozaik')
    # receipts.r_downloader('.pdf', cond1=3, cond2=5)
    # receipts.xling('recipise', 'recipise', '.pdf', 'y')
    # receipts.save_message('.pdf', 3, 10, [1, 6, 8], receipts.get_message, receipts.get_type, receipts.get_cod_fiscal)
|