"""OCR a folder of numbered page images with Baidu AipOcr into one text file.

The page images are expected to live in a directory named after the input
file without its extension (``huanbu.pdf`` -> ``huanbu/``) and to be named
``001.png``, ``002.png``, ...
"""
import os
import sys
import time

from aip import AipOcr
from pdf2image import convert_from_path  # noqa: F401 - used only by the optional PDF -> PNG step below

# pdf2image requires poppler to be installed (macOS: `brew install poppler`).

# Resolve relative paths against the directory of the running script.
os.chdir(sys.path[0])

# NOTE(review): these credentials are committed in source control. They should
# be rotated and supplied through the environment; the literals remain only as
# backward-compatible fallbacks.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '41664067')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'EVaKTX2gnEqHkt25vSO6c99h')
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', 'SivvxYGGbVwaPaxRyxqxGdkW1xlXV3BF')

client = AipOcr(APP_ID, API_KEY, SECRET_KEY)


def format_number(number, width):
    """Return *number* formatted as a string zero-padded to *width* characters."""
    return f'{number:0{width}}'


def baidu_ocr(fname, *, output='huanbu.txt', first_page=1, last_page=330, delay=1):
    """Run OCR over the page images belonging to *fname* and write the text out.

    Args:
        fname: Path of the source document. Only its name is used: the page
            images are read from the directory obtained by dropping the
            extension (created if it does not exist yet).
        output: Path of the UTF-8 text file to (over)write.
        first_page: Number of the first page image to process.
        last_page: Number of the last page image to process (inclusive).
        delay: Seconds to wait after each request.

    Raises:
        FileNotFoundError: If an expected page image is missing.
        RuntimeError: If the OCR response carries no ``words_result`` entry.
    """
    # splitext only strips a real extension, so dots in directory names
    # (e.g. "scans.v2/book") are left alone.
    dirname = os.path.splitext(fname)[0]
    os.makedirs(dirname, exist_ok=True)

    # Optional one-off step to render the PDF pages as PNG files:
    #   convert_from_path(fname, fmt='png', output_folder=dirname,
    #                     poppler_path=r'/opt/homebrew/Cellar/poppler/23.10.0/bin')
    # pdf2image prefixes the generated file names; to strip the prefix run
    # this shell command inside the image directory:
    #   for i in $(ls); do echo $i && mv $i $(echo $i | sed 's/.*-\(.*\)/\1/');done

    # The context manager guarantees the output file is flushed and closed
    # even when a page fails half-way through.
    with open(output, 'w', encoding='utf-8') as out:
        for page in range(first_page, last_page + 1):
            img_name = f'{format_number(page, 3)}.png'
            print(img_name)

            # Join paths instead of os.chdir() so the process working
            # directory is untouched and repeated calls keep working.
            with open(os.path.join(dirname, img_name), 'rb') as fimg:
                img_bytes = fimg.read()

            msg = client.basicAccurate(img_bytes)
            words_result = msg.get('words_result')
            if words_result is None:
                # Fail loudly with the page and the error fields from the
                # response rather than with an opaque TypeError.
                raise RuntimeError(
                    'OCR failed for {}: error_code={}, error_msg={}'.format(
                        img_name, msg.get('error_code'), msg.get('error_msg')
                    )
                )

            for item in words_result:
                out.write('{}\n'.format(item.get('words')))
            # A form feed marks the page boundary in the output text.
            out.write('\f\n')

            # Pause between requests.
            time.sleep(delay)

    print("write done")


if __name__ == '__main__':
    baidu_ocr('huanbu.pdf')