receipt.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. import os
  2. import re
  3. import urllib3
  4. from datetime import date, datetime, timedelta
  5. from urllib3.exceptions import InsecureRequestWarning
  6. from urllib3.util.retry import Retry
  7. # from oauth2client.service_account import ServiceAccountCredentials
  8. import pypdf
  9. import gspread
  10. import requests
  11. import xlsxwriter
  12. from gspread import BackOffHTTPClient
  13. from google.oauth2.service_account import Credentials
  14. from requests.adapters import HTTPAdapter
  15. from pypdf.errors import PdfReadError
  16. from pypdf import PageObject as my_page # @UnusedImport
  17. from tqdm import tqdm
  18. def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='#'):
  19. '''
  20. Call in a loop to create terminal progress bar
  21. @params:
  22. iteration - Required : current iteration (Int)
  23. total - Required : total iterations (Int)
  24. prefix - Optional : prefix string (Str)
  25. suffix - Optional : suffix string (Str)
  26. decimals - Optional : positive number of decimals in percent complete (Int)
  27. length - Optional : character length of bar (Int)
  28. fill - Optional : bar fill character (Str)
  29. '''
  30. percent = ('{0:.' + str(decimals) + 'f}').format(100 * (iteration / float(total)))
  31. filledLength = int(length * iteration // total)
  32. bar = fill * filledLength + '-' * (length - filledLength)
  33. print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
  34. # Print New Line on Complete
  35. if iteration == total:
  36. print()
  37. class Receipts(object):
  38. def __init__(self, cookie=None):
  39. self.cookie = cookie
  40. # print('object recipisa created')
  41. @staticmethod
  42. def filter_files(_ext, path=None):
  43. '''lists all pdf's in current directory'''
  44. if path:
  45. my_files = filter(lambda f: f.endswith(_ext), os.listdir(path))
  46. else:
  47. my_files = filter(lambda f: f.endswith(_ext), os.listdir())
  48. return my_files
  49. def pdf_renamer(self, iter_list, *args):
  50. '''renames the file with data from inside pdf
  51. :param :iter_list is itarable with the pdf's
  52. :param :args are from the pdf'''
  53. self.iter_list = iter_list
  54. checker = str('_'.join(args).split())
  55. if checker.replace("['", '').replace("']", '') in iter_list:
  56. print(iter_list + ' was already renamed', end='\r', flush=True)
  57. else:
  58. print('renaming adding ' + checker.replace("['", '').replace("']", '') + iter_list)
  59. n_name = str(iter_list[:-4] + '_%s' % '_'.join(args).split() + iter_list[-4:])
  60. os.rename(iter_list, n_name.replace("['", '').replace("']", ''))
  61. def pdf_reader(self, my_file): # cf_id=slice(34, 35), ind_id=slice(9, 10), data_id=slice(11, 12), reg_id=slice(25, 26),
  62. # tip_id=slice(17, 18), luna_id=slice(30, 31), anul_id=slice(31, 32), mesaj_id=slice(35, 1000)):
  63. '''reads all the text from pdf (recipisa) depended on indexes
  64. :default :args are the indexes '''
  65. self.my_file = my_file
  66. # rect = 'rectificativ'
  67. # un = 'unica'
  68. try:
  69. read_my_file = pypdf.PdfReader(my_file)
  70. except PdfReadError:
  71. raise PdfReadError('receipts still processing on server')
  72. my_page = read_my_file.pages[0]
  73. my_content = my_page.extract_text()
  74. cod_fiscal_re = re.search(u'(?<=CIF: )[0-9]+', my_content)
  75. cod_fiscal = cod_fiscal_re.group()
  76. index_incarcare = re.search('(?<=Index încărcare: )[0-9]+|(?<=Index înc\?rcare: )[0-9]+', my_content)
  77. index_incarcare = index_incarcare.group()
  78. data_depunere = re.search(u'(?<=din )[0-9.]+', my_content)
  79. data_depunere = data_depunere.group()
  80. data_reg = re.search(u'(?<=din data de\n)[0-9.]+|(?<=din data de)[0-9.]+|(?<=din data de )[0-9.]+|(?<=data de )[0-9.]+', my_content)
  81. data_reg = data_reg.group()
  82. tip = re.search(u'(?<=tip )[A-Za-z0-9]+', my_content)
  83. tip = tip.group()
  84. luna_re = re.search(u'(?<=raportare )[0-9]+', my_content)
  85. if luna_re:
  86. luna = luna_re.group()
  87. anul = my_content[luna_re.end() + 1:luna_re.end() + 5]
  88. else:
  89. luna = data_reg[3:5]
  90. anul = data_reg[-4:]
  91. # anul = anul.group()
  92. mesaj = my_content[cod_fiscal_re.end() + 1:cod_fiscal_re.end() + 1001]
  93. rect_re = re.compile('[A-Za-z]*ectific[a-z]*')
  94. if rect_re.search(my_content):
  95. rect1 = 'true'
  96. else:
  97. rect1 = 'false'
  98. self.content = my_content
  99. self.rect = rect1
  100. self.cod_fiscal = cod_fiscal
  101. self.index_incarcare = index_incarcare
  102. self.mesaj = mesaj.strip()
  103. self.type_ = tip
  104. self.month_ = luna
  105. self.year_ = anul
  106. self.reg_date = data_reg
  107. my_dict = {cod_fiscal: [index_incarcare, data_depunere, data_reg, tip, luna, anul, mesaj[:len(mesaj) - 1], cod_fiscal + tip, rect1]}
  108. self.data = my_dict
  109. return my_dict
  110. def get_rect(self):
  111. '''return cod fiscal'''
  112. return self.rect
  113. def get_cod_fiscal(self):
  114. '''return cod fiscal'''
  115. return self.cod_fiscal
  116. def get_index(self):
  117. '''return index de incarcare'''
  118. return self.index_incarcare
  119. def get_message(self):
  120. '''return receipt message'''
  121. return self.mesaj
  122. def get_type(self):
  123. '''return type of the declaration'''
  124. return self.type_
  125. def get_month(self):
  126. return self.month_
  127. def get_year(self):
  128. return self.year_
  129. def get_reg_date(self):
  130. return self.reg_date
  131. def check_name(self, my_iter):
  132. '''check if file already renamed was downloaded already or not'''
  133. self.my_iter = my_iter
  134. new_path = []
  135. und_score = '_'
  136. for i in my_iter:
  137. if und_score in i:
  138. i = i[:i.index(und_score)]
  139. new_path.append(i + '.pdf')
  140. return new_path
  141. def get_url(self, rindex, ext_, my_url='https://www.anaf.ro/StareD112/ObtineRecipisa?numefisier='):
  142. '''
  143. Alternative url if original not working:
  144. https://epatrim.anaf.ro/StareD112/ObtineRecipisa?numefisier=
  145. '''
  146. '''get recipise server
  147. :param rindex is index from google sheet'''
  148. urllib3.disable_warnings(InsecureRequestWarning)
  149. self.my_url = my_url
  150. self.rindex = rindex
  151. rindex = rindex + '.pdf'
  152. session = requests.Session()
  153. retry = Retry(connect=3, backoff_factor=1)
  154. adapter = HTTPAdapter(max_retries=retry)
  155. session.mount('http://', adapter)
  156. session.mount('https://', adapter)
  157. m_path = os.getcwd() + '\\__cache__\\' + rindex
  158. try:
  159. m_file = open(m_path, 'wb')
  160. except FileNotFoundError:
  161. os.mkdir('__cache__')
  162. finally:
  163. m_file = open(m_path, 'wb')
  164. my_eq = '='
  165. if my_eq in str(m_file):
  166. my_eq = str(m_file).index(my_eq) + 2
  167. it = '\\'
  168. if it in str(m_file)[my_eq:-2]:
  169. it = str(m_file)[my_eq:-2][::-1].index(it) + 2
  170. # print(str(m_file)[-it:-2])
  171. if str(m_file)[-it:-2] in self.filter_files(ext_):
  172. print(str(m_file)[-it:-2], ' already saved', end='\r', flush=True)
  173. m_file.close()
  174. elif str(m_file)[-it:-2] in self.check_name(self.filter_files(ext_)):
  175. print(str(m_file)[-it:-2], ' already saved', end='\r', flush=True)
  176. m_file.close()
  177. else:
  178. # download
  179. if self.cookie is not None:
  180. my_req = session.get(my_url + rindex, cookies=self.cookie, verify=False, stream=True)
  181. else:
  182. my_req = session.get(my_url + rindex, verify=False, stream=True)
  183. # finish download
  184. if len(my_req.content) > 0:
  185. # m_file = open(rindex, 'wb')
  186. with open(rindex, 'wb') as m_file:
  187. print('writing', str(m_file)[my_eq:-2])
  188. return m_file.write(my_req.content)
  189. def r_downloader(self, ext_, cond1=4, cond2=6): # arg=None
  190. '''downloads the found receipts'''
  191. my_path = os.getcwd().split('\\')[-1:]
  192. period_date = self._dir_fdate()
  193. print(my_path, 'VS', period_date)
  194. my_month = datetime.today() - timedelta(days=datetime.today().day)
  195. sel = input('enter to continue, s to select month')
  196. if date.today().month == 1:
  197. year_ = date.today().year - 1
  198. else:
  199. year_ = date.today().year
  200. mth_yr = '%s-%s' % (date(1900, my_month.month, 28).strftime('%b'), year_)
  201. print('mth_yr', mth_yr)
  202. if sel == '':
  203. # if my_path == date_l:
  204. if my_path == period_date:
  205. for i, k in self.cond_range(cond1, cond2):
  206. if i != '' and k == mth_yr:
  207. self.get_url(i, ext_)
  208. elif sel == 's':
  209. m = input('Mmm-YYY')
  210. for i, k in self.cond_range(cond1, cond2):
  211. if i != '' and k == m:
  212. self.get_url(i, ext_)
  213. def _dir_fdate(self):
  214. my_month = datetime.today() - timedelta(days=datetime.today().day)
  215. date_l = []
  216. if date.today().month <= 10:
  217. if date.today().month == 1:
  218. date_l.append('%s %s' % (date.today().year - 1, 12))
  219. else:
  220. date_l.append('%s 0%s' % (date.today().year, my_month.month))
  221. else:
  222. if date.today().month == 1:
  223. date_l.append('%s %s' % (date.today().year - 1, 12))
  224. else:
  225. date_l.append('%s %s' % (date.today().year, my_month.month))
  226. self.date_l = date_l
  227. return self.date_l
  228. def get_gspread(self, json_name, g_sheet_name, sheet=''):
  229. '''imports google sheet
  230. :param :json file for login
  231. :param :g_sheet_name is the workbook name
  232. :param :sheet where the indexes are'''
  233. self.json_name = json_name
  234. scope = ['https://spreadsheets.google.com/feeds',
  235. 'https://www.googleapis.com/auth/drive']
  236. # creds = ServiceAccountCredentials.from_json_keyfile_name(json_name, scope)
  237. creds = Credentials.from_service_account_file(json_name, scopes=scope)
  238. gsheet = gspread.authorize(creds, http_client=BackOffHTTPClient)
  239. self.sheet = gsheet.open(g_sheet_name).sheet1
  240. # return sheet
  241. def cond_range(self, col1=1, col2=2):
  242. '''returns tuple for conditioning download
  243. :param col1, col2 are integers, the column numbers'''
  244. self.col1 = col1
  245. col1 = self.sheet.col_values(col1)[3:]
  246. self.col2 = col2
  247. col2 = self.sheet.col_values(col2)[3:]
  248. cond_range = zip(col1, col2)
  249. return cond_range
  250. def xling(self, xl_param, wsheet_param, ext_, renamer='n', path='',
  251. headers=['codfiscal', 'index', 'data_dep', 'data_inreg', 'tip', 'luna', 'anul', 'mesaj', 'id', 'rect']):
  252. '''
  253. :param :xl_param = Workbook name
  254. :param :w_sheet_param = worksheet name
  255. :param :headers = default or user defined'''
  256. output_data = xlsxwriter.Workbook(xl_param + '.xlsx')
  257. w_sheet = output_data.add_worksheet(wsheet_param)
  258. # my_file = open('test.pdf', 'rb')
  259. my_row = 0
  260. my_col = 0
  261. print('initiating headers: ', ', '.join(headers))
  262. hcol = 0
  263. print('')
  264. for h in headers:
  265. headformat = output_data.add_format({'bold': True})
  266. w_sheet.write(0, hcol, h, headformat)
  267. print('writing header in column', hcol + 1, 'name', h, end='\r', flush=True)
  268. hcol += 1
  269. for my_list in self.filter_files(ext_, path=path):
  270. '''
  271. paths = 1 - c:\\Users\\levente.marton\\git\\Hello-World\\omv\\Mzk\\
  272. 2 - c:\\Users\\conta1\\git\\Hello-World\\omv\\Mzk\\ 3 -c:\\Users\\timi\\git\\Hello-World\\omv\\Mzk\\
  273. '''
  274. # self.pdf_reader(my_list)
  275. my_dict1 = self.pdf_reader(path + my_list)
  276. for i in my_dict1:
  277. my_row += 1
  278. print('writing row %s' % my_row, end='\r', flush=True)
  279. w_sheet.write(my_row, my_col, i)
  280. w_sheet.write_row(my_row, 1, my_dict1[i])
  281. if renamer == 'y':
  282. self.pdf_renamer(path + my_list, i, my_dict1[i][3], my_dict1[i][4], my_dict1[i][5])
  283. print('%s.xlsx written & ready' % xl_param)
  284. format1 = output_data.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
  285. w_sheet.autofilter(0, 0, 100, 9)
  286. w_sheet.set_column(7, 7, 50)
  287. w_sheet.set_column(8, 8, 15)
  288. w_sheet.set_column(0, 6, 11)
  289. w_sheet.freeze_panes(1, 0)
  290. w_sheet.conditional_format('I2:I500', {'type': 'duplicate', 'format': format1})
  291. output_data.close()
  292. return my_row
  293. def save_message(self, ext_, col1, col2, offset, *args): # message=-1
  294. '''write receipt messages in column to google sheet'''
  295. # for ind, mess in self.cond_range(col1, col2):
  296. for ind, mess in tqdm(self.cond_range(col1, col2)):
  297. if mess == 'FALSE':
  298. for m_files in self.filter_files(ext_):
  299. for dummy in self.pdf_reader(m_files):
  300. if ind == self.get_index():
  301. print('{0} - {1}-{2}-{3}'.format(self.get_cod_fiscal(), self.get_index(), self.get_type(), self.get_message()))
  302. m_ind = 0
  303. # printProgressBar(0, len(args))
  304. for v in tqdm(args):
  305. # self.v = v
  306. for ind, mess in self.cond_range(col1, col2):
  307. if mess == 'FALSE':
  308. for m_files in self.filter_files(ext_):
  309. for dummy in self.pdf_reader(m_files):
  310. if ind == self.get_index():
  311. cell = self.sheet.find(self.get_index())
  312. self.sheet.update_cell(cell.row, col2 - offset[m_ind], v())
  313. # printProgressBar(e + 1, len(args))
  314. m_ind += 1
  315. if __name__ == '__main__':
  316. my_recipisa = Receipts()
  317. p = my_recipisa.pdf_reader('241707933_17259191_D300_1_2021.pdf')
  318. print(my_recipisa.year_)
  319. # my_recipisa.get_gspread('Pysheet26134-2daf66659e50.json', 'Recipisa Mozaik')
  320. # my_recipisa.r_downloader('.pdf', cond1=3, cond2=5)
  321. # my_recipisa.xling('recipise', 'recipise', '.pdf', 'y')
  322. # my_recipisa.save_message('.pdf', 3, 10, [1, 6, 8], my_recipisa.get_message, my_recipisa.get_type, my_recipisa.get_cod_fiscal)