python-使用百度云api识别发票并录入EXCEL

发布于 2021-04-29 07:42

修改发票的名称和录入信息特别烦琐。百度了这个问题,发现已有大神实现了类似的功能;我在他的基础上修改了一下,实现了对目录下 PDF 及图片格式发票的识别,并将文件统一重命名为“发票代码-发票号码”的形式。

代码如下

# -*- coding: utf-8 -*-"""Created on Tue Apr 28 22:36:58 2021"""
import datetime
import glob
import os
import sys
import tempfile

import fitz
import numpy as np
import pandas as pd
from aip import AipOcr
###### Invoice fields ######
# Maps the field keys of a VAT-invoice OCR result to the Chinese column
# headers used in the output table. The order of the values is also the
# column order of the resulting DataFrame.
dict_vat_invoice = {
    'Agent': '是否代开',
    'AmountInFiguers': '价税合计(小写)',
    'AmountInWords': '价税合计(大写)',
    'CheckCode': '校验码',
    'Checker': '复核',
    'InvoiceCode': '发票代码',
    'InvoiceDate': '开票日期',
    'InvoiceNum': '发票号码',
    'InvoiceType': '发票种类',
    'InvoiceTypeOrg': '发票名称',
    'MachineCode': '机器编号',
    'NoteDrawer': '开票人',
    'Password': '密码区',
    'Payee': '收款人',
    'PurchaserAddress': '购方地址及电话',
    'PurchaserBank': '购方开户行及账号',
    'PurchaserName': '购方名称',
    'PurchaserRegisterNum': '购方纳税人识别号',
    'SellerAddress': '销售方地址及电话',
    'SellerBank': '销售方开户行及账号',
    'SellerName': '销售方名称',
    'SellerRegisterNum': '销售方纳税人识别号',
    'TotalAmount': '合计金额',
    'TotalTax': '合计税额',
}

# Smaller field set, used for invoices recognised through the generic
# invoice endpoint (the roll-invoice branch of the OCR routine).
dict_invoice = {
    'InvoiceCode': '发票代码',
    'InvoiceNum': '发票号码',
    'InvoiceDate': '开票日期',
    'AmountInFiguers': '价税合计(小写)',
    'AmountInWords': '价税合计(大写)',
    'MachineNum': '机器编号',
    'CheckCode': '校验码',
    'SellerName': '销售方名称',
    'SellerRegisterNum': '销售方纳税人识别号',
    'PurchaserName': '购方名称',
    'PurchaserRegisterNum': '购方纳税人识别号',
    'TotalTax': '合计税额',
}

###### Baidu API configuration ######
# The credentials below are masked placeholders; replace them with real
# values before running. NOTE(review): hard-coded secrets should not be
# committed -- consider loading them from the environment instead.
config = {
    'appId': '********',
    'apiKey': 'FbcPh*********************',
    'secretKey': 'ir3AhXhS*********************',
}
client = AipOcr(**config)


def get_file_content(file):
    """Return the raw bytes of the file at path *file*."""
    with open(file, 'rb') as fp:
        return fp.read()
#### Invoice image recognition ####
def _ocr_fields_to_table(ocr_result, *field_maps):
    """Convert an OCR ``words_result`` mapping into a DataFrame.

    The mapping's keys become the columns. Each mapping in *field_maps* is
    applied in order as a column rename, and the returned frame keeps only
    the columns named by the values of the last mapping, in that order.

    Raises:
        KeyError: if a column required by the last mapping is absent.
    """
    by_field = pd.DataFrame.from_dict(ocr_result, orient='index')
    table = pd.DataFrame(by_field.values.T, columns=by_field.index)
    for field_map in field_maps:
        table = table.rename(columns=field_map)
    # Select with an explicit list: a dict ``values()`` view is not a
    # list-like indexer pandas is guaranteed to accept.
    return table[list(field_maps[-1].values())]


def image_ocr_inv(image_path):
    """Run OCR on an invoice image and return its fields as a DataFrame.

    The image is first sent to the generic invoice endpoint to read the
    invoice type. Roll invoices are converted directly from that result;
    ordinary and electronic ordinary VAT invoices are sent again to the
    VAT-invoice endpoint and converted from its result.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        A DataFrame with Chinese column headers, or ``None`` when the
        invoice type is unsupported or the result is unusable (a message
        is printed in that case).
    """
    print(image_path)
    image = get_file_content(image_path)

    # Presumably a failed API call returns a dict without 'words_result';
    # treat that like an unreadable image instead of raising KeyError.
    ocr_result = client.invoice(image).get('words_result')
    if not ocr_result:
        print('图片不清晰')
        return None

    invoice_type = ocr_result.get('InvoiceType')
    print(invoice_type)

    if invoice_type == '河北增值税普通发票(卷票)':
        return _ocr_fields_to_table(ocr_result, dict_invoice)

    if invoice_type in ('河北增值税普通发票', '河北增值税电子普通发票'):
        vat_result = client.vatInvoice(image).get('words_result')
        # An empty total amount is used as the signal of a failed read.
        if not vat_result or not vat_result.get('AmountInFiguers'):
            print('图片不清晰')
            return None
        return _ocr_fields_to_table(vat_result, dict_invoice, dict_vat_invoice)

    if invoice_type == '河北通用机打发票':
        print('出租车票暂无法识别')
        return None

    print('图片不清晰')
    return None

def pdf2image(pdfpath, image_path):
    """Render the PDF at *pdfpath* to a PNG image written to *image_path*.

    Every page is rendered at 3x zoom without an alpha channel and written
    to the same *image_path*, so for a multi-page PDF the file ends up
    holding the last page.

    Uses PyMuPDF's snake_case API (``page_count``, ``prerotate``,
    ``get_pixmap``, ``Pixmap.save``) in place of the deprecated camelCase
    names.

    Args:
        pdfpath: Path of the PDF file to render.
        image_path: Path of the PNG file to write.
    """
    rotate = 0
    zoom_x = 3
    zoom_y = 3
    # The transform is the same for every page, so build it once.
    mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    with fitz.open(pdfpath) as pdf_doc:
        for pg in range(pdf_doc.page_count):
            page = pdf_doc[pg]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Force PNG output regardless of the extension of image_path.
            pix.save(image_path, output="png")
    print('转化图片完成')
# Collect the PDF / JPG / PNG files found one directory level below
# source_dir.
source_dir = "C:/Users/新建文件夹/"
exts = ['*/*.pdf', '*/*.jpg', '*/*.png']
dir_list = [f for ext in exts for f in glob.glob(source_dir + ext)]

# Recognise each file, rename it to "<invoice code>-<invoice number><ext>"
# and collect the recognised fields.
inv_data_list = []
for old_file in dir_list:
    print(old_file)
    ext = os.path.splitext(old_file)[1]
    filepath = os.path.dirname(old_file)
    # Compare case-insensitively: globbing can return e.g. ".PDF" files on
    # case-insensitive file systems.
    suffix = ext.lower()

    # Reset per file so a skipped file never reuses the previous result.
    inv_data = None
    if suffix == ".pdf":
        # Render into a private temp file rather than "<name>.png" next to
        # the PDF, which would overwrite and then delete a real image of
        # the same name.
        fd, image_dir = tempfile.mkstemp(suffix=".png")
        os.close(fd)
        try:
            pdf2image(old_file, image_dir)
            inv_data = image_ocr_inv(image_dir)
        finally:
            os.remove(image_dir)
    elif suffix in (".jpg", ".png"):
        inv_data = image_ocr_inv(old_file)

    if inv_data is None:
        continue

    inv_code, inv_num = inv_data[['发票代码', '发票号码']].values.tolist()[0]
    new_file = filepath + '/{}-{}{}'.format(inv_code, inv_num, ext)
    if os.path.exists(new_file) and not os.path.samefile(old_file, new_file):
        # Duplicate invoice: keep the original name instead of failing or
        # overwriting the file that already carries the target name.
        print('文件已存在,跳过重命名: {}'.format(new_file))
    else:
        os.rename(old_file, new_file)
    inv_data_list.append(inv_data)

if inv_data_list:
    data_result = pd.concat(inv_data_list)  # merge all invoices
    data_result.drop_duplicates(inplace=True)

    with pd.ExcelWriter('E:/发票.xlsx') as writer:
        data_result.to_excel(writer, sheet_name='汇总', index=False)
else:
    # pd.concat raises on an empty list; there is nothing to export.
    print('未识别到任何发票')
实现效果

还存在的问题:
  1. 如果有重复的发票,会因为目标文件已存在而无法重命名

  2. 如果图片不清晰,河北增值税普通发票(卷票)可能会因为识别不出“卷票”两个字而调用了错误的接口,导致无法识别