receipt2.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. import os
  2. import re
  3. import urllib3
  4. from datetime import date, datetime, timedelta
  5. from urllib3.exceptions import InsecureRequestWarning
  6. from urllib3.util.retry import Retry
  7. # from oauth2client.service_account import ServiceAccountCredentials
  8. import PyPDF225
  9. import gspread
  10. import requests
  11. import xlsxwriter
  12. from gspread.client import BackoffClient
  13. from google.oauth2.service_account import Credentials
  14. from requests.adapters import HTTPAdapter
  15. from PyPDF225.errors import PdfReadError
  16. from PyPDF225 import PageObject as my_page
  17. from tqdm import tqdm
  def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='#'):
      '''
      Draw a one-line terminal progress bar; call it once per loop iteration.
      @params:
          iteration   - Required  : current iteration (Int)
          total       - Required  : total iterations (Int)
          prefix      - Optional  : text printed before the bar (Str)
          suffix      - Optional  : text printed after the percentage (Str)
          decimals    - Optional  : number of decimals in the percentage (Int)
          length      - Optional  : width of the bar in characters (Int)
          fill        - Optional  : character used for the completed part (Str)
      '''
      if total:
          fraction = iteration / float(total)
          filledLength = int(length * iteration // total)
      else:
          # A zero total used to raise ZeroDivisionError; with nothing to do
          # the bar is simply drawn as complete.
          fraction = 1.0
          filledLength = length
      percent = '{0:.{1}f}'.format(100 * fraction, decimals)
      bar = fill * filledLength + '-' * (length - filledLength)
      # The leading and trailing '\r' keep redrawing the same terminal line.
      print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
      # Move to a fresh line once the last iteration has been drawn.
      if iteration == total:
          print()
  class Receipts(object):
      '''Download, parse, rename and report on receipt ("recipisa") PDFs.

      Most methods also store their arguments and results on the instance
      (for example pdf_reader() fills the values returned by the get_*()
      accessors); the methods rely on that shared state to cooperate.
      '''

      # Column titles xling() writes when the caller supplies none. A tuple,
      # so the shared default cannot be mutated between calls.
      _DEFAULT_HEADERS = ('codfiscal', 'index', 'data_dep', 'data_inreg', 'tip',
                          'luna', 'anul', 'mesaj', 'id', 'rect')

      def __init__(self, cookie=None):
          '''Remember the optional cookies sent with every download request.'''
          self.cookie = cookie

      @staticmethod
      def filter_files(_ext, path=None):
          '''Return an iterator over directory entries whose name ends with _ext.

          :param _ext: file-name suffix to keep, e.g. '.pdf'
          :param path: directory to list; the current working directory when
              None or empty
          '''
          entries = os.listdir(path) if path else os.listdir()
          return (entry for entry in entries if entry.endswith(_ext))

      def pdf_renamer(self, iter_list, *args):
          '''Append receipt fields to the name of a PDF file.

          :param iter_list: path of the PDF to rename (one path string,
              despite the parameter name)
          :param args: values read from the PDF; they are joined with '_' and
              inserted before the file extension
          '''
          self.iter_list = iter_list
          # Build the suffix directly. The previous implementation went
          # through str(list) and stripped "['" / "']", which left quote and
          # comma debris when a field contained whitespace and produced '_[]'
          # when no fields were given. Whitespace now becomes '_'.
          suffix = '_'.join('_'.join(map(str, args)).split())
          if not suffix:
              return
          # Only the file name is inspected, so a directory that happens to
          # contain the suffix does not mark the file as already renamed.
          if suffix in os.path.basename(iter_list):
              print(iter_list + ' was already renamed', end='\r', flush=True)
              return
          print('renaming adding ' + suffix + iter_list)
          stem, extension = os.path.splitext(iter_list)
          os.rename(iter_list, stem + '_' + suffix + extension)

      @staticmethod
      def _required_match(pattern, text, field):
          '''Return the first match of pattern in text or raise ValueError naming field.'''
          found = re.search(pattern, text)
          if found is None:
              raise ValueError('receipt field %r not found in the PDF text' % field)
          return found

      def pdf_reader(self, my_file):
          '''Read the first page of a receipt PDF and extract its fields.

          :param my_file: path (or file object) handed to the PDF reader
          :return: {cod_fiscal: [index, data_dep, data_reg, tip, luna, anul,
              mesaj, cod_fiscal + tip, rect]}; the same values are stored on
              the instance for the get_*() accessors
          :raises PdfReadError: when the file cannot be opened as a PDF
          :raises ValueError: when a mandatory field is missing from the text
          '''
          self.my_file = my_file
          try:
              reader = PyPDF225.PdfFileReader(my_file)
          except PdfReadError as err:
              # Keep the original cause attached to the friendlier message.
              raise PdfReadError('receipts still processing on server') from err
          first_page = reader.pages[0]
          return self._parse_receipt_text(first_page.extract_text())

      def _parse_receipt_text(self, my_content):
          '''Extract the receipt fields from the text of the first PDF page.'''
          find = self._required_match
          cod_fiscal_re = find(u'(?<=CIF: )[0-9]+', my_content, 'CIF')
          cod_fiscal = cod_fiscal_re.group()
          index_incarcare = find(u'(?<=Index încărcare: )[0-9]+', my_content,
                                 'Index încărcare').group()
          data_depunere = find(u'(?<=din )[0-9.]+', my_content, 'data depunere').group()
          data_reg = find(u'(?<=din data de\n)[0-9.]+|(?<=din data de)[0-9.]+',
                          my_content, 'data inregistrare').group()
          tip = find(u'(?<=tip )[A-Za-z0-9]+', my_content, 'tip').group()

          luna_re = re.search(u'(?<=raportare )[0-9]+', my_content)
          if luna_re:
              luna = luna_re.group()
              # The four characters after the separator that follows the month.
              anul = my_content[luna_re.end() + 1:luna_re.end() + 5]
          else:
              # No reporting period in the text: fall back to the month and
              # year sliced out of the registration date string.
              luna = data_reg[3:5]
              anul = data_reg[-4:]

          # Up to 1000 characters following the fiscal code (one separator
          # character is skipped).
          mesaj = my_content[cod_fiscal_re.end() + 1:cod_fiscal_re.end() + 1001]
          # Equivalent to searching '[A-Za-z]*ectific[a-z]*': both optional
          # parts may match nothing, so only the literal core matters.
          rect1 = 'true' if 'ectific' in my_content else 'false'

          self.content = my_content
          self.rect = rect1
          self.cod_fiscal = cod_fiscal
          self.index_incarcare = index_incarcare
          self.mesaj = mesaj
          self.type_ = tip
          self.month_ = luna
          self.year_ = anul
          self.reg_date = data_reg
          # The stored list carries the message without its last character.
          my_dict = {cod_fiscal: [index_incarcare, data_depunere, data_reg, tip, luna, anul,
                                  mesaj[:-1], cod_fiscal + tip, rect1]}
          self.data = my_dict
          return my_dict

      def get_rect(self):
          '''return 'true' when the last parsed receipt mentions a rectification, else 'false' '''
          return self.rect

      def get_cod_fiscal(self):
          '''return the fiscal code (CIF) of the last parsed receipt'''
          return self.cod_fiscal

      def get_index(self):
          '''return the upload index of the last parsed receipt'''
          return self.index_incarcare

      def get_message(self):
          '''return the message text of the last parsed receipt'''
          return self.mesaj

      def get_type(self):
          '''return the declaration type of the last parsed receipt'''
          return self.type_

      def get_month(self):
          '''return the reporting month of the last parsed receipt'''
          return self.month_

      def get_year(self):
          '''return the reporting year of the last parsed receipt'''
          return self.year_

      def get_reg_date(self):
          '''return the registration date of the last parsed receipt'''
          return self.reg_date

      def check_name(self, my_iter):
          '''Map renamed files back to the name they were downloaded under.

          :param my_iter: iterable of file names
          :return: for every name containing '_', the part before the first
              '_' followed by '.pdf'; names without '_' are skipped
          '''
          self.my_iter = my_iter
          und_score = '_'
          return [name[:name.index(und_score)] + '.pdf'
                  for name in my_iter if und_score in name]

      def get_url(self, rindex, ext_, my_url='https://www.anaf.ro/StareD112/ObtineRecipisa?numefisier='):
          '''Download one receipt into the current directory unless it is already there.

          :param rindex: receipt index (string) taken from the google sheet
          :param ext_: extension used to list the files already present
          :param my_url: endpoint the file name is appended to
          :return: number of bytes written, or None when the receipt was
              already saved or the server returned an empty body
          '''
          # NOTE(security): certificate verification is disabled for this
          # endpoint (verify=False below) and the resulting warning is
          # silenced; revisit if the server certificate can be validated.
          urllib3.disable_warnings(InsecureRequestWarning)
          self.my_url = my_url
          self.rindex = rindex
          file_name = rindex + '.pdf'

          # Earlier versions opened an (empty) placeholder in ./__cache__ and
          # derived the file name from the repr() of that file object, leaking
          # the handle. The placeholder is still created so the directory
          # contents stay as before, but it is closed immediately and the
          # name is simply file_name.
          cache_dir = os.path.join(os.getcwd(), '__cache__')
          os.makedirs(cache_dir, exist_ok=True)
          with open(os.path.join(cache_dir, file_name), 'wb'):
              pass

          present = list(self.filter_files(ext_))
          if file_name in present or file_name in self.check_name(present):
              print(file_name, ' already saved', end='\r', flush=True)
              return None

          request_kwargs = {'verify': False, 'stream': True}
          if self.cookie is not None:
              request_kwargs['cookies'] = self.cookie
          with requests.Session() as session:
              adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=1))
              session.mount('http://', adapter)
              session.mount('https://', adapter)
              payload = session.get(my_url + file_name, **request_kwargs).content

          if not payload:
              return None
          with open(file_name, 'wb') as pdf_file:
              print('writing', file_name)
              return pdf_file.write(payload)

      def r_downloader(self, ext_, cond1=4, cond2=6):
          '''Interactively download the receipts listed in the google sheet.

          Pressing enter downloads the rows of the previous month, but only
          when the current folder is named after that period (see
          _dir_fdate); answering 's' asks for a 'Mmm-YYYY' period instead.

          :param ext_: extension passed on to get_url
          :param cond1: sheet column holding the receipt indexes
          :param cond2: sheet column holding the 'Mmm-YYYY' period
          '''
          # basename() instead of splitting on '\\' so the folder check also
          # works where the path separator is not a backslash.
          my_path = [os.path.basename(os.getcwd())]
          today = date.today()
          period_date = self._dir_fdate(today)
          print(my_path, 'VS', period_date)
          prev_month_end = today.replace(day=1) - timedelta(days=1)
          sel = input('enter to continue, s to select month')
          mth_yr = '%s-%s' % (prev_month_end.strftime('%b'), prev_month_end.year)
          print('mth_yr', mth_yr)

          if sel == '':
              if my_path != period_date:
                  return
              wanted = mth_yr
          elif sel == 's':
              wanted = input('Mmm-YYYY')
          else:
              return
          for index, period in self.cond_range(cond1, cond2):
              if index != '' and period == wanted:
                  self.get_url(index, ext_)

      def _dir_fdate(self, today=None):
          '''Return ['YYYY MM'] for the month before today (the expected folder name).

          :param today: date to count back from; defaults to date.today()
          '''
          if today is None:
              today = date.today()
          prev_month_end = today.replace(day=1) - timedelta(days=1)
          self.date_l = ['%s %02d' % (prev_month_end.year, prev_month_end.month)]
          return self.date_l

      def get_gspread(self, json_name, g_sheet_name, sheet=''):
          '''Open the google sheet holding the receipt indexes and keep it in self.sheet.

          :param json_name: service-account json key file used to log in
          :param g_sheet_name: name of the workbook
          :param sheet: title of the worksheet to use; the first worksheet
              when empty (this parameter used to be ignored)
          '''
          self.json_name = json_name
          scope = ['https://spreadsheets.google.com/feeds',
                   'https://www.googleapis.com/auth/drive']
          creds = Credentials.from_service_account_file(json_name, scopes=scope)
          gsheet = gspread.authorize(creds, client_class=BackoffClient)
          workbook = gsheet.open(g_sheet_name)
          self.sheet = workbook.worksheet(sheet) if sheet else workbook.sheet1

      def cond_range(self, col1=1, col2=2):
          '''Pair up the values of two sheet columns, skipping the first three rows.

          :param col1: number of the first column
          :param col2: number of the second column
          :return: iterator of (col1 value, col2 value) tuples
          '''
          self.col1 = col1
          self.col2 = col2
          first = self.sheet.col_values(col1)[3:]
          second = self.sheet.col_values(col2)[3:]
          return zip(first, second)

      def xling(self, xl_param, wsheet_param, ext_, renamer='n', path='', headers=None):
          '''Write one row per receipt PDF into a new xlsx workbook.

          :param xl_param: workbook name, without the '.xlsx' extension
          :param wsheet_param: worksheet name
          :param ext_: extension of the files to read
          :param renamer: 'y' to also rename each PDF with its parsed fields
          :param path: directory holding the PDFs; current directory when empty
          :param headers: column titles; the default set when None
          :return: number of data rows written
          '''
          if headers is None:
              headers = self._DEFAULT_HEADERS
          my_row = 0
          # The context manager closes the workbook even if a PDF fails to parse.
          with xlsxwriter.Workbook(xl_param + '.xlsx') as output_data:
              w_sheet = output_data.add_worksheet(wsheet_param)
              print('initiating headers: ', ', '.join(headers))
              print('')
              headformat = output_data.add_format({'bold': True})
              for hcol, title in enumerate(headers):
                  w_sheet.write(0, hcol, title, headformat)
                  print('writing header in column', hcol + 1, 'name', title, end='\r', flush=True)

              for pdf_name in self.filter_files(ext_, path=path):
                  # join() also copes with a path given without a trailing separator.
                  pdf_path = os.path.join(path, pdf_name)
                  for cod_fiscal, fields in self.pdf_reader(pdf_path).items():
                      my_row += 1
                      print('writing row %s' % my_row, end='\r', flush=True)
                      w_sheet.write(my_row, 0, cod_fiscal)
                      w_sheet.write_row(my_row, 1, fields)
                      if renamer == 'y':
                          # fields[3:6] are tip, luna and anul.
                          self.pdf_renamer(pdf_path, cod_fiscal, fields[3], fields[4], fields[5])

              print('%s.xlsx written & ready' % xl_param)
              format1 = output_data.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
              # Same ranges as before (100 filter rows, duplicates checked in
              # I2:I500), extended when more receipts than that were written.
              w_sheet.autofilter(0, 0, max(my_row, 100), 9)
              w_sheet.set_column(7, 7, 50)
              w_sheet.set_column(8, 8, 15)
              w_sheet.set_column(0, 6, 11)
              w_sheet.freeze_panes(1, 0)
              w_sheet.conditional_format(1, 8, max(my_row, 499), 8,
                                         {'type': 'duplicate', 'format': format1})
          return my_row

      def _matching_receipts(self, rows, pdf_files, index_by_file):
          '''Yield the index of each pending sheet row for every PDF that carries it.

          A row is pending when its flag value is 'FALSE'. pdf_reader() has
          just been run on the matching PDF when the index is yielded, so the
          get_*() accessors describe that receipt.

          :param rows: iterable of (index, flag) pairs
          :param pdf_files: names of the PDFs to look through
          :param index_by_file: dict used to remember each PDF's index, so a
              PDF known to hold another index is not parsed again
          '''
          for ind, mess in rows:
              if mess != 'FALSE':
                  continue
              for pdf_name in pdf_files:
                  if index_by_file.get(pdf_name, ind) != ind:
                      continue
                  self.pdf_reader(pdf_name)
                  index_by_file[pdf_name] = self.get_index()
                  if ind == self.get_index():
                      yield ind

      def save_message(self, ext_, col1, col2, offset, *args):
          '''Write values of the matching receipts back into the google sheet.

          :param ext_: extension of the receipt files in the current directory
          :param col1: sheet column holding the receipt indexes
          :param col2: sheet column holding the flag; rows with 'FALSE' are processed
          :param offset: one entry per callable in args; the value is written
              to column col2 - offset[i]
          :param args: zero-argument callables (typically the get_*()
              accessors of this object), called right after the matching PDF
              has been parsed
          :raises ValueError: when offset has fewer entries than args
          '''
          # Fail before touching the sheet instead of part-way through the updates.
          if len(offset) < len(args):
              raise ValueError('offset needs one entry per callable in args')
          pdf_files = list(self.filter_files(ext_))
          index_by_file = {}

          # First pass: report what was found.
          rows = tqdm(self.cond_range(col1, col2))
          for dummy in self._matching_receipts(rows, pdf_files, index_by_file):
              print('{0} - {1}-{2}-{3}'.format(self.get_cod_fiscal(), self.get_index(),
                                               self.get_type(), self.get_message()))

          # Second pass: one sweep over the sheet per value to write. The rows
          # are read again for each sweep, as before.
          for m_ind, v in enumerate(tqdm(args)):
              rows = self.cond_range(col1, col2)
              for dummy in self._matching_receipts(rows, pdf_files, index_by_file):
                  cell = self.sheet.find(self.get_index())
                  self.sheet.update_cell(cell.row, col2 - offset[m_ind], v())
  if __name__ == '__main__':
      # Smoke run: parse one sample receipt and show the year that was read.
      my_recipisa = Receipts()
      my_recipisa.pdf_reader('241707933_17259191_D300_1_2021.pdf')
      print(my_recipisa.get_year())
      # Typical full workflow, kept for reference:
      # my_recipisa.get_gspread('Pysheet26134-2daf66659e50.json', 'Recipisa Mozaik')
      # my_recipisa.r_downloader('.pdf', cond1=3, cond2=5)
      # my_recipisa.xling('recipise', 'recipise', '.pdf', 'y')
      # my_recipisa.save_message('.pdf', 3, 10, [1, 6, 8], my_recipisa.get_message, my_recipisa.get_type, my_recipisa.get_cod_fiscal)