# pyanything/misc/baidu_ocr_pdf2txt.py

import time

from pdf2image import convert_from_path
from aip import AipOcr
import os, sys

# Requires poppler to be installed (e.g. `brew install poppler`).
# sys.path[0] is the directory of the script being run; switching to it makes
# the relative PDF / image / output paths used below resolve next to this file.
os.chdir(sys.path[0])

# Baidu AIP OCR credentials. Each value can be overridden through an
# environment variable so keys can be rotated without editing the source.
# SECURITY: the literal fallbacks below are secrets committed to the
# repository; they are kept only for backward compatibility and should be
# revoked and removed.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '41664067')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'EVaKTX2gnEqHkt25vSO6c99h')
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', 'SivvxYGGbVwaPaxRyxqxGdkW1xlXV3BF')
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)


def format_number(number, width):
    """Return ``number`` formatted with zero padding to at least ``width`` characters.

    Values already wider than ``width`` are returned unpadded.
    """
    # Build the format spec explicitly, e.g. width=3 -> "03".
    zero_pad_spec = f'0{width}'
    return format(number, zero_pad_spec)
def baidu_ocr(fname, *, out_path='huanbu.txt', first_page=1, last_page=330, delay=1):
    """OCR pre-rendered page images of a PDF and write the text to a file.

    The page images must already exist as ``001.png``, ``002.png``, ... in a
    directory named after ``fname`` without its extension (``huanbu.pdf`` ->
    ``huanbu/``). Each image is sent to ``client.basicAccurate``; every
    recognized line is written on its own line, and each page is terminated
    by a form-feed line (``'\\f\\n'``).

    Args:
        fname: Path of the PDF. Only used to derive the image directory name.
        out_path: Text file to (over)write, UTF-8 encoded. Defaults to the
            previously hard-coded ``'huanbu.txt'``.
        first_page: Number of the first page image to process (inclusive).
        last_page: Number of the last page image to process (inclusive).
        delay: Seconds to sleep after each OCR request.

    Raises:
        OSError: If a page image cannot be opened or the output cannot be
            written.
        RuntimeError: If an OCR response carries no ``'words_result'`` entry.
    """
    dirname = os.path.splitext(fname)[0]
    os.makedirs(dirname, exist_ok=True)
    # The page images can be generated with pdf2image, e.g.:
    #   convert_from_path(fname, fmt='png', output_folder=dirname,
    #                     poppler_path=r'/opt/homebrew/Cellar/poppler/23.10.0/bin')
    # If the generated files carry a name prefix, strip it from inside the
    # image directory with:
    #   for i in $(ls); do echo $i && mv $i $(echo $i | sed 's/.*-\(.*\)/\1/');done

    # Paths are joined instead of os.chdir()-ing into the image directory, so
    # the process working directory is left untouched and the function can be
    # called more than once with relative paths.
    with open(out_path, 'w', encoding='utf-8') as out:
        for num in range(first_page, last_page + 1):
            img_name = f'{format_number(num, 3)}.png'
            print(img_name)
            # The OCR call is given the raw bytes of the PNG file.
            with open(os.path.join(dirname, img_name), 'rb') as fimg:
                img_bytes = fimg.read()
            msg = client.basicAccurate(img_bytes)
            words_result = msg.get('words_result')
            if words_result is None:
                # Presumably an API error payload; include the whole response
                # so the cause is visible instead of an opaque TypeError.
                raise RuntimeError(f'OCR failed for {img_name}: {msg}')
            out.writelines(f"{item.get('words')}\n" for item in words_result)
            out.write('\f\n')  # form feed marks the end of a page
            time.sleep(delay)  # pause between consecutive API requests
    print("write done")


# Entry point: runs only when this file is executed as a script, not on import.
if __name__ == "__main__":
    baidu_ocr(fname='huanbu.pdf')