You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

84 lines
2.5 KiB

import PyPDF2
import unicodedata
import os
import re
def create_subfolder_if_not_exists():
    """Ensure the ``notasProcessadas`` folder exists in the current directory.

    The folder is created when missing and left untouched when it is
    already there.

    Raises:
        FileExistsError: If ``notasProcessadas`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate isdir()
    # check, so there is no window between "check" and "create".
    os.makedirs("notasProcessadas", exist_ok=True)
def extract_num_nota(filename):
    """Return the invoice number encoded at the end of a PDF file name.

    The number is whatever sits between the last underscore and the
    trailing ``.pdf`` extension, e.g. ``"NFSe_000123.pdf"`` -> ``"000123"``.
    When the name has no underscore the whole stem is returned.

    Args:
        filename: File name (not a full path) of the invoice PDF.

    Returns:
        The text after the last underscore, without the extension.
    """
    # Strip only a *trailing* extension (case-insensitive); searching for the
    # first ".pdf" anywhere in the name would cut names such as
    # "a.pdf_x_77.pdf" in the wrong place.
    stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
    # rpartition yields the whole stem as the last item when "_" is absent.
    return stem.rpartition("_")[2]
def is_pdf(filename):
    """Tell whether *filename* ends with the lowercase ``.pdf`` extension."""
    return filename.endswith(".pdf")
def filter_interesting_lines(lines_array):
    """Keep only the lines that fall inside a "TOMADOR DE SERVI" section.

    A line containing ``"TOMADOR DE SERVI"`` opens the section (and is kept);
    a line containing ``"NOTA FISCAL DE SERVI"`` closes it (and is dropped).
    When a line contains both markers, the opening marker takes precedence.

    Args:
        lines_array: Iterable of text lines, in document order.

    Returns:
        A new list with the lines seen while the section was open.
    """
    selected = []
    inside_section = False
    for text in lines_array:
        opens_section = "TOMADOR DE SERVI" in text
        # Only a marker line can flip the state; any other line inherits it.
        if opens_section or "NOTA FISCAL DE SERVI" in text:
            inside_section = opens_section
        if not inside_section:
            continue
        selected.append(text)
    return selected
def condition_nome_razao(input_str):
    """Tell whether the line carries the ``NOME/RAZ...`` label (case-insensitive)."""
    return "NOME/RAZ" in input_str.upper()
def format_nome_razao(input_str):
    """Turn a "label: name" line into an ASCII, file-name-safe token.

    Steps: take the text after the first ``:`` (the whole line when there is
    no colon), trim it, strip accents, turn spaces into underscores, replace
    characters that are unsafe in file names with underscores, upper-case the
    result and cap it at 40 characters.

    Args:
        input_str: The raw text line, e.g. ``"Nome/Razão Social: João S/A"``.

    Returns:
        The sanitized name, e.g. ``"JOAO_S_A"``.
    """
    raw_name = input_str[input_str.find(":") + 1:].strip()
    # Decompose accented characters and drop whatever is not plain ASCII.
    ascii_name = (
        unicodedata.normalize("NFKD", raw_name)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    # Spaces are replaced after normalization so that spaces produced by the
    # normalization itself (e.g. from a non-breaking space) are covered too.
    underscored = ascii_name.replace(" ", "_")
    # The result becomes part of a file name: path separators (as in "S/A")
    # and other reserved characters would otherwise break the rename.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", underscored)
    return safe_name.upper()[:40]
def condition_cpf_cnpj(input_str):
    """Tell whether the line carries the exact, case-sensitive ``CPF/CNPJ`` label."""
    return "CPF/CNPJ" in input_str
def string_to_numeric(input_str):
    """Return *input_str* with every non-numeric character removed.

    A character is kept when ``str.isnumeric()`` is true for it, so the
    original order of the surviving characters is preserved.
    """
    return "".join(symbol for symbol in input_str if symbol.isnumeric())
def format_cpf_cnpj(input_str):
    """Extract the digits of the first formatted CNPJ or CPF found in a line.

    A CNPJ (``NN.NNN.NNN/NNNN-NN``) is looked for first; only when none is
    present is a CPF (``NNN.NNN.NNN-NN``) searched.

    Args:
        input_str: Text line that may contain a punctuated CNPJ or CPF.

    Returns:
        The matched document as a digits-only string, or ``None`` when the
        line contains neither format.
    """
    document_patterns = (
        r'\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}',  # CNPJ
        r'\d{3}\.\d{3}\.\d{3}\-\d{2}',  # CPF
    )
    for pattern in document_patterns:
        match = re.search(pattern, input_str)
        if match is not None:
            # The match holds only digits and ". / -" separators; drop the latter.
            return re.sub(r"\D", "", match.group())
    return None
def main():
    """Rename every PDF in the current directory into ``notasProcessadas``.

    For each ``.pdf`` file, the first page is read, the lines of the
    "TOMADOR DE SERVI..." section are scanned for the name and the CPF/CNPJ,
    and the file is moved to
    ``notasProcessadas/<name>_<cpf_or_cnpj>_<invoice number>.pdf``.

    Files with no pages, or whose name or CPF/CNPJ cannot be found, are left
    where they are and reported on standard output.
    """
    create_subfolder_if_not_exists()
    dir_processadas = "notasProcessadas"
    for filename in os.listdir():
        if not is_pdf(filename):
            continue
        reader = PyPDF2.PdfReader(filename)
        if len(reader.pages) == 0:
            print(f"Skipping {filename}: PDF has no pages")
            continue
        page_text = reader.pages[0].extract_text() or ""
        interesting_lines = filter_interesting_lines(page_text.split('\n'))
        # Reset for every file: otherwise a PDF with a missing field would
        # either crash on an unbound name or silently reuse the values
        # extracted from the previously processed file.
        nome_razao = None
        cpf_cnpj = None
        for line in interesting_lines:
            if condition_nome_razao(line):
                nome_razao = format_nome_razao(line)
            elif condition_cpf_cnpj(line):
                cpf_cnpj = format_cpf_cnpj(line)
        if nome_razao is None or cpf_cnpj is None:
            print(f"Skipping {filename}: name or CPF/CNPJ not found")
            continue
        num_nota = extract_num_nota(filename)
        new_filename = nome_razao + "_" + cpf_cnpj + "_" + num_nota + ".pdf"
        os.rename(filename, os.path.join(dir_processadas, new_filename))
# Run the renaming job only when executed as a script, so that importing this
# module (e.g. to reuse or test its helpers) does not touch any files.
if __name__ == "__main__":
    main()