You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
9 months ago
|
import time
|
||
|
|
||
|
from pdf2image import convert_from_path
|
||
|
from aip import AipOcr
|
||
|
import os, sys
|
||
|
|
||
|
# Requires poppler to be installed, e.g. `brew install poppler`
|
||
|
# Switch the working directory to sys.path[0] so the relative paths used below
# resolve against it (presumably the script's own folder when run directly -- confirm).
os.chdir(sys.path[0])
|
||
|
|
||
|
# Baidu OCR credentials.
# SECURITY NOTE(review): these values are committed in source. Prefer supplying
# them through the environment variables below; the literals are kept only as
# fallbacks so existing runs keep working, and should be rotated and removed.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '41664067')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'EVaKTX2gnEqHkt25vSO6c99h')
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', 'SivvxYGGbVwaPaxRyxqxGdkW1xlXV3BF')

# Shared OCR client used by baidu_ocr().
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
|
||
|
|
||
|
|
||
|
def format_number(number: int, width: int) -> str:
    """Return ``number`` as text, left-padded with zeros to ``width`` characters.

    Values that already need ``width`` characters or more are returned
    unpadded (never truncated).
    """
    return f'{number:0{width}}'
|
||
|
def baidu_ocr(fname, *, first_page=1, last_page=330, out_path='huanbu.txt',
              delay=1):
    """OCR pre-rendered page images with the Baidu client and save the text.

    This function does not render the PDF itself. It expects the page images
    to already exist as ``001.png``, ``002.png``, ... inside a folder named
    after ``fname`` without its extension (``huanbu.pdf`` -> ``huanbu/``).

    For every page, each recognised line is written on its own line of the
    output file, followed by a form-feed line (``'\\f\\n'``) marking the page
    break.

    Args:
        fname: Path of the source PDF; only used to derive the image folder.
        first_page: Number of the first page image to process (inclusive).
        last_page: Number of the last page image to process (inclusive).
        out_path: Text file to write; it is overwritten if it exists.
        delay: Seconds to sleep after each page before the next request.

    Raises:
        FileNotFoundError: If an expected page image is missing.
        RuntimeError: If the OCR response carries no ``words_result`` entry.
    """
    dirname = fname.rsplit('.', 1)[0]
    # The folder is created if absent (as before), but the page images must
    # still be placed in it beforehand, e.g. with pdf2image:
    #   convert_from_path(fname, fmt='png', output_folder=dirname, poppler_path=r'/opt/homebrew/Cellar/poppler/23.10.0/bin')
    # If the generated image files carry a prefix, strip it with:
    #   for i in $(ls); do echo $i && mv $i $(echo $i | sed 's/.*-\(.*\)/\1/');done
    os.makedirs(dirname, exist_ok=True)

    # The context manager guarantees the text written so far is flushed and
    # the file is closed even when a page fails part-way through the run.
    with open(out_path, 'w', encoding='utf-8') as out:
        for num in range(first_page, last_page + 1):
            img_name = f'{format_number(num, 3)}.png'
            print(img_name)

            # Build the path explicitly instead of os.chdir(dirname): changing
            # the process working directory would break later relative paths.
            with open(os.path.join(dirname, img_name), 'rb') as fimg:
                img_bytes = fimg.read()

            # The image file is already closed before the network call.
            msg = client.basicAccurate(img_bytes)
            words_result = msg.get('words_result')
            if words_result is None:
                # Surface the raw response rather than failing with an opaque
                # "NoneType is not iterable" error.
                raise RuntimeError(f'OCR failed for {img_name}: {msg!r}')

            out.writelines(f"{item.get('words')}\n" for item in words_result)
            out.write('\f\n')  # page separator
            time.sleep(delay)

    print("write done")
|
||
|
|
||
|
|
||
|
# Press the green button in the gutter to run the script.
|
||
|
if __name__ == '__main__':
    # Run the OCR for huanbu.pdf; baidu_ocr derives the image folder name
    # from this file name.
    baidu_ocr('huanbu.pdf')
|