调用百度证件识别 API 识别营业执照
1. 目标:
读取指定文件夹下的营业执照照片,通过百度 OCR 识别 API 解析识别,并把结果保存到本地
2. 思路:
读取指定文件夹下的所有图片
调用接口识别,获得识别结果
保存识别结果到 csv 文件中
3. 具体实现
AK 和 SK 需要注册百度云开发者账号,然后创建应用,领取证书 OCR 识别权益后,才能生效
保存 log_id 是为了在出现意外后,可以通过接口获取已经生成的识别结果,避免重新调用 OCR 接口而消耗调用次数
import pprint
import requests
import base64
import os
import pandas as pd
def get_files(directory, extensions=None):
    """Recursively collect the paths of the files under *directory*.

    Each collected path is printed as it is found, then returned in the
    order produced by ``os.walk``.

    Args:
        directory: Root folder to scan (sub-folders are included).
        extensions: Optional file-name suffix, or iterable of suffixes
            (e.g. ``('.jpg', '.png')``), matched case-insensitively.
            When ``None`` (the default) every file is returned.

    Returns:
        A list of path strings; empty when nothing matches or the
        directory does not exist.
    """
    wanted = None
    if extensions is not None:
        # Accept a lone string as well as an iterable of suffixes.
        if isinstance(extensions, str):
            extensions = (extensions,)
        wanted = tuple(ext.lower() for ext in extensions)

    res = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Skip files that are not of a requested type so they are never
            # handed to downstream processing.
            if wanted is not None and not file.lower().endswith(wanted):
                continue
            path = os.path.join(root, file)
            print(path)
            res.append(path)
    return res
class BaiduOcr:
    """Small client for Baidu's business-license OCR endpoint.

    ``AK`` / ``SK`` are the API key and secret key of a Baidu Cloud
    application. They are exchanged for an access token, which is cached on
    the instance in ``self.access`` and reused for later requests.
    """

    AK = 'xxxxxxxxxxxxxxxxx'
    SK = 'xxxxxxxxxxxxxxxxxxxxxxxxx'

    TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
    LICENSE_URL = 'https://aip.baidubce.com/rest/2.0/ocr/v1/business_license'
    # Seconds to wait for the server before giving up on a request.
    TIMEOUT = 30

    # Recognition fields copied from ``words_result`` into each output row,
    # in output-column order.
    _FIELDS = (
        '单位名称',
        '地址',
        '法人',
        '社会信用代码',
        '税务登记号',
        '类型',
        '证件编号',
    )

    def __init__(self, ak=None, sk=None):
        """Create a client.

        Args:
            ak: Optional API key overriding the class-level ``AK``.
            sk: Optional secret key overriding the class-level ``SK``.
        """
        if ak is not None:
            self.AK = ak
        if sk is not None:
            self.SK = sk
        self.access = ''

    def get_baidu_access(self):
        """Fetch an access token, cache it in ``self.access`` and return it.

        Raises:
            RuntimeError: If the token endpoint answers with an error status
                or the reply carries no ``access_token``.
        """
        # client_id is the AK and client_secret the SK from the Baidu console.
        response = requests.get(
            self.TOKEN_URL,
            params={
                'grant_type': 'client_credentials',
                'client_id': self.AK,
                'client_secret': self.SK,
            },
            timeout=self.TIMEOUT,
        )
        try:
            payload = response.json()
        except ValueError:
            payload = {}
        if not isinstance(payload, dict):
            payload = {}

        access_token = payload.get('access_token')
        if not response.ok or not access_token:
            # The token itself is deliberately never printed: it is a
            # credential and must not end up in console logs.
            detail = payload.get('error_description') or payload.get('error') or ''
            raise RuntimeError(
                f'Failed to obtain Baidu access token '
                f'(HTTP {response.status_code}) {detail}'.strip()
            )
        self.access = access_token
        return access_token

    def get_business_license_info(self, image_path: str):
        """Run business-license OCR on one image.

        Args:
            image_path: Path of the image file to recognise.

        Returns:
            The decoded JSON reply as a dict, or ``None`` when the request
            fails, returns an error status, or the body is not JSON.
        """
        # Read the image as bytes; the context manager closes the handle.
        with open(image_path, 'rb') as image_file:
            img = base64.b64encode(image_file.read())

        if not self.access:
            self.get_baidu_access()

        try:
            response = requests.post(
                self.LICENSE_URL,
                params={'access_token': self.access},
                data={'image': img},
                headers={'content-type': 'application/x-www-form-urlencoded'},
                timeout=self.TIMEOUT,
            )
        except requests.RequestException as exc:
            # One unreachable/slow request should not abort a whole batch.
            print(f'OCR request failed for {image_path}: {exc}')
            return None

        if not response.ok:
            print(f'OCR request for {image_path} returned HTTP {response.status_code}')
            return None

        try:
            result = response.json()
        except ValueError:
            print(f'OCR response for {image_path} is not valid JSON')
            return None
        print(result)
        return result

    def package_info(self, image_path: str):
        """Recognise one image and flatten the result into a CSV-ready row.

        The reply's ``log_id`` (when present) is recorded through
        ``add_log_id`` before the fields are extracted.

        Args:
            image_path: Path of the image file to recognise.

        Returns:
            A dict with ``log_id``, the image name and one entry per
            recognised field (``''`` for fields missing from the reply), or
            ``None`` when recognition produced no usable result.
        """
        license_info = self.get_business_license_info(image_path)
        if not license_info:
            return None

        log_id = license_info.get('log_id')
        if log_id is not None:
            add_log_id(log_id)

        words_result = license_info.get('words_result')
        if not isinstance(words_result, dict):
            # NOTE(review): error replies are expected to carry
            # error_code / error_msg instead of words_result - confirm
            # against the Baidu OCR API documentation.
            print(
                f'OCR failed for {image_path}: '
                f"error_code={license_info.get('error_code')}, "
                f"error_msg={license_info.get('error_msg')}"
            )
            return None

        row = {
            'log_id': log_id,
            '图片名称': self._image_name(image_path),
        }
        row.update(self._extract_fields(words_result))
        return row

    @staticmethod
    def _image_name(image_path):
        """Return the file name of *image_path* without its final extension.

        Both ``/`` and ``\\`` are treated as path separators so the result
        does not depend on the platform that produced the path.
        """
        file_name = image_path.replace('\\', '/').rsplit('/', 1)[-1]
        return os.path.splitext(file_name)[0]

    @classmethod
    def _extract_fields(cls, words_result):
        """Map each name in ``_FIELDS`` to its ``words`` text, or ``''``."""
        fields = {}
        for field in cls._FIELDS:
            entry = words_result.get(field)
            fields[field] = entry.get('words', '') if isinstance(entry, dict) else ''
        return fields
def add_log_id(log_id: str, log_file: str = 'log_id.txt'):
    """Append *log_id* as one line to the log-id file.

    Args:
        log_id: Identifier to record. It is formatted into the line, so
            non-string values such as integers are written as their text
            form.
        log_file: Path of the file to append to; it is created if missing.
            Defaults to ``log_id.txt`` in the current working directory.
    """
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f'{log_id}\n')
def main(image_dir='图片', output_csv='商户信息.csv'):
    """Recognise every file under *image_dir* and save the results as CSV.

    Files for which ``package_info`` returns nothing are left out of the
    output.

    Args:
        image_dir: Folder that is scanned (recursively, via ``get_files``)
            for images. Defaults to ``图片``.
        output_csv: Path of the CSV file to write. Defaults to
            ``商户信息.csv``.
    """
    bd_ocr = BaiduOcr()
    rows = []
    for image_path in get_files(image_dir):
        row = bd_ocr.package_info(str(image_path))
        if row:
            rows.append(row)

    df = pd.DataFrame(rows)
    # utf-8-sig adds a BOM to the file; presumably so spreadsheet tools
    # detect the encoding of the Chinese headers - confirm.
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
# Entry point: start the batch run only when this file is executed as a
# script, so that importing it does not trigger any work.
if __name__ == "__main__":
    main()
版权声明:
本站所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自
有限进步!
喜欢就支持一下吧
打赏
微信
支付宝