1. 目标：

读取指定文件夹下的营业执照照片，通过百度 OCR 识别 API 进行识别，并把识别结果保存到本地。

2. 思路：

读取指定文件夹下的所有图片
调用接口识别，获得识别结果
保存识别结果到 csv 文件中

3. 具体实现

AK 和 SK 需要先注册百度云开发者账号，然后创建应用，并领取证件 OCR 识别权益后，才能生效。
保存 log_id 是为了在出现意外后，可以通过接口获取已经生成的识别结果，而不必重新调用 OCR 接口、额外消耗调用次数。

import pprint

import requests
import base64
import os
import pandas as pd


def get_files(directory):
    """Return the paths of all files found under *directory*, recursively.

    Each path is built by joining the walked folder with the file name, is
    printed as it is discovered, and is appended to the returned list in
    ``os.walk`` order. A directory that yields nothing from ``os.walk``
    produces an empty list.
    """
    found = []
    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            full_path = os.path.join(folder, name)
            # Echo every discovered file so the run shows what will be processed.
            print(full_path)
            found.append(full_path)
    return found


class BaiduOcr:
    """Small client for the Baidu OCR business-license recognition endpoint.

    The access token is fetched lazily on the first recognition call and then
    cached on the instance in ``self.access``.
    """

    # API Key / Secret Key obtained from the Baidu cloud console (placeholders).
    AK = 'xxxxxxxxxxxxxxxxx'
    SK = 'xxxxxxxxxxxxxxxxxxxxxxxxx'

    # Seconds to wait on the server before an HTTP call is abandoned, so a
    # stalled connection cannot hang the whole batch forever.
    TIMEOUT = 30

    # Keys copied out of ``words_result``; they double as the output column
    # names, in this order.
    _RESULT_FIELDS = (
        '单位名称',
        '地址',
        '法人',
        '社会信用代码',
        '税务登记号',
        '类型',
        '证件编号',
    )

    def __init__(self):
        # Cached access token; stays empty until get_baidu_access() succeeds.
        self.access = ''

    def get_baidu_access(self):
        """Request an access token using AK/SK, cache it and return it.

        Returns:
            The token string, or None when the HTTP request is not successful
            or the response body carries no ``access_token``. On failure the
            cached token is left unchanged.
        """
        # client_id is the AK and client_secret is the SK from the console.
        host = 'https://aip.baidubce.com/oauth/2.0/token'
        query = {
            'grant_type': 'client_credentials',
            'client_id': self.AK,
            'client_secret': self.SK,
        }
        response = requests.get(host, params=query, timeout=self.TIMEOUT)
        if not response:
            return None
        access_token = response.json().get('access_token')
        if not access_token:
            return None
        # The token is a credential, so it is deliberately not printed.
        self.access = access_token
        return access_token

    def get_business_license_info(self, image_path: str):
        """Run business-license recognition on one image file.

        Args:
            image_path: Path of the image to upload.

        Returns:
            The decoded JSON response, or None when no access token could be
            obtained or the HTTP request was not successful.
        """
        request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
        # Without a token the call cannot succeed, so skip it instead of
        # sending a request with an empty access_token.
        if not self.access and not self.get_baidu_access():
            print(f'Could not obtain an access token; skipping {image_path}')
            return None
        # Read the image in binary mode; the context manager guarantees the
        # file handle is closed even if reading fails.
        with open(image_path, 'rb') as image_file:
            img = base64.b64encode(image_file.read())
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        response = requests.post(
            request_url,
            params={'access_token': self.access},
            data={"image": img},
            headers=headers,
            timeout=self.TIMEOUT,
        )
        if not response:
            return None
        result = response.json()
        print(result)
        return result

    @staticmethod
    def _image_name(image_path: str) -> str:
        """Return the file name of *image_path* without directory or extension.

        Both ``/`` and ``\\`` are treated as separators so the result is the
        same on every platform; only the final extension is stripped.
        """
        file_name = os.path.basename(image_path.replace('\\', '/'))
        return os.path.splitext(file_name)[0]

    @classmethod
    def _build_row(cls, image_name, license_info):
        """Flatten one API response into an output row.

        Returns None when the response has no ``words_result`` mapping (for
        example an error payload). Fields absent from ``words_result`` are
        filled with an empty string instead of raising ``KeyError``.
        """
        words_result = license_info.get('words_result')
        if not isinstance(words_result, dict):
            return None
        row = {'log_id': license_info.get('log_id'), '图片名称': image_name}
        for field in cls._RESULT_FIELDS:
            entry = words_result.get(field)
            row[field] = entry.get('words', '') if isinstance(entry, dict) else ''
        return row

    def package_info(self, image_path: str):
        """Recognise one image and return its fields as a flat dict.

        On success the response's ``log_id`` is recorded through
        ``add_log_id`` before the row is returned.

        Args:
            image_path: Path of the image to recognise.

        Returns:
            A dict with ``log_id``, the image name and the recognised fields,
            or None when recognition failed, so the caller can skip the image
            and keep processing the rest of the batch.
        """
        license_info = self.get_business_license_info(image_path)
        if not license_info:
            return None
        row = self._build_row(self._image_name(image_path), license_info)
        if row is None:
            print(f'OCR failed for {image_path}: '
                  f"{license_info.get('error_code')} {license_info.get('error_msg')}")
            return None
        if row['log_id'] is not None:
            add_log_id(row['log_id'])
        return row


def add_log_id(log_id: str):
    """Append *log_id* as its own line to ``log_id.txt`` in the working directory.

    The file is opened in append mode, so earlier entries are kept and the
    file is created when it does not exist yet.
    """
    entry = f'{log_id}\n'
    with open('log_id.txt', mode='a') as log_file:
        log_file.write(entry)


def main():
    """Recognise every file under ``图片`` and write the results to a CSV.

    Each path returned by ``get_files`` is passed to
    ``BaiduOcr.package_info``; falsy results are dropped. The collected rows
    are written to ``商户信息.csv`` without the index column, encoded as
    ``utf-8-sig``.
    """
    paths = get_files('图片')
    ocr_client = BaiduOcr()
    rows = []
    for path in paths:
        row = ocr_client.package_info(str(path))
        if not row:
            # Nothing usable came back for this file; leave it out of the CSV.
            continue
        rows.append(row)
    table = pd.DataFrame(rows)
    table.to_csv('商户信息.csv', index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    main()

调用百度证件识别 API 识别营业执照

http://localhost:28080/archives/376f86ee-57cf-454a-902a-40a6ce291af0

作者

Mark

发布于

2024-04-17

更新于

2024-09-02

许可

脚本/工具 Python 实用脚本