You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

49 lines
1.5 KiB
Python

import time
from pdf2image import convert_from_path
from aip import AipOcr
import os, sys
# Requires poppler to be installed (macOS: `brew install poppler`).
# Work relative to sys.path[0] (the script's directory when run as
# `python script.py`) so the relative paths used below resolve there.
os.chdir(sys.path[0])

# Baidu OCR credentials. Environment variables take precedence; the literal
# values are kept only as a fallback so existing runs keep working.
# SECURITY: these secrets are committed in source -- rotate them and remove
# the fallbacks once the environment variables are configured.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '41664067')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'EVaKTX2gnEqHkt25vSO6c99h')
SECRET_KEY = os.environ.get(
    'BAIDU_OCR_SECRET_KEY', 'SivvxYGGbVwaPaxRyxqxGdkW1xlXV3BF'
)

# Shared OCR client used by baidu_ocr().
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def format_number(number: int, width: int) -> str:
    """Return ``number`` as a string left-padded with zeros to ``width`` chars.

    A value that already needs ``width`` or more characters is returned
    unpadded (it is never truncated).
    """
    return f'{number:0{width}}'
def baidu_ocr(fname, pages=330):
    """Run Baidu OCR over pre-rendered page images and save the text.

    The page images must already exist as ``001.png``, ``002.png``, ... in a
    directory named after ``fname`` without its extension (``huanbu.pdf`` ->
    ``huanbu/``); this function does not render the PDF itself. Each
    recognised line is written on its own line to ``<stem>.txt`` (so
    ``huanbu.pdf`` -> ``huanbu.txt``), and every page is terminated by a
    form-feed line.

    Args:
        fname: Path of the source PDF; only used to derive the image
            directory and the output file name.
        pages: Number of page images to process, numbered from 1. Defaults
            to 330, the page count that used to be hard-coded.

    Raises:
        FileNotFoundError: If an expected page image is missing.
        RuntimeError: If the OCR response contains no ``words_result``
            (presumably an API-side error; the raw response is included).
    """
    stem = fname.rsplit('.', 1)[0]
    os.makedirs(stem, exist_ok=True)

    # The page images are produced separately, e.g. with
    #   convert_from_path(fname, fmt='png', output_folder=stem,
    #                     poppler_path=r'/opt/homebrew/Cellar/poppler/23.10.0/bin')
    # If the generated files carry a name prefix, strip everything up to the
    # last '-' so they become 001.png, 002.png, ...:
    #   for i in $(ls); do echo $i && mv $i $(echo $i | sed 's/.*-\(.*\)/\1/');done

    # `with` guarantees the text written so far is flushed and the file is
    # closed even when a page fails half-way through the run.
    with open(f'{stem}.txt', 'w', encoding='utf-8') as out:
        for num in range(1, pages + 1):
            img_name = f'{format_number(num, 3)}.png'
            print(img_name)

            # Build the path explicitly instead of os.chdir(), so the
            # process working directory is left untouched between calls.
            with open(os.path.join(stem, img_name), 'rb') as fimg:
                img_bytes = fimg.read()

            msg = client.basicAccurate(img_bytes)
            words_result = msg.get('words_result')
            if words_result is None:
                raise RuntimeError(f'OCR failed for {img_name}: {msg!r}')

            for item in words_result:
                out.write('{}\n'.format(item.get('words')))
            out.write('\f\n')  # form feed marks the end of a page

            # Pause between requests, as the original script did.
            time.sleep(1)
    print("write done")
# Script entry point: OCR the PDF given as the first command-line argument,
# falling back to 'huanbu.pdf' when none is supplied.
if __name__ == '__main__':
    baidu_ocr(sys.argv[1] if len(sys.argv) > 1 else 'huanbu.pdf')