You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
36 lines
939 B
Python
36 lines
939 B
Python
from PIL import Image
|
|
import pytesseract
|
|
import re
|
|
from loguru import logger
|
|
|
|
'''
|
|
Download the Simplified Chinese language pack (chi_sim) for Tesseract:
|
|
wget https://github.com/tesseract-ocr/tessdata/raw/main/chi_sim.traineddata
|
|
mkdir tessdata
|
|
mv chi_sim.traineddata tessdata
|
|
export TESSDATA_PREFIX=tessdata
|
|
'''
|
|
|
|
# Add a file sink: DEBUG-and-above records go to huanbu.log, with
# rotation='100 KB' and compression='tar.gz' passed to loguru for the
# rotated files.
logger.add('huanbu.log', rotation='100 KB', level='DEBUG', compression='tar.gz')
# Emit one record at import time so the sink configuration is exercised.
logger.debug("That's it, beautiful and simple logging!")
|
|
def format_number(number: int, width: int) -> str:
    """Return ``number`` as a string left-padded with zeros to ``width``.

    Args:
        number: Value to format (the script passes a positive page index).
        width: Minimum width of the result; values that are already wider
            are returned unpadded and untruncated.

    Returns:
        The zero-padded string, e.g. ``format_number(7, 3) == '007'``.
    """
    return f'{number:0{width}}'
|
|
|
|
|
|
def extract_text(image_path):
    """OCR an image with the ``chi_sim`` model and return whitespace-free text.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with every
        whitespace character (spaces, tabs, line breaks) removed.
    """
    # Use a context manager so the underlying file handle is closed as soon
    # as OCR is done; the caller invokes this once per image in a long loop.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang='chi_sim')
    # Drop all whitespace from the OCR output.
    return re.sub(r'\s', '', text)
|
|
|
|
|
|
if __name__ == "__main__":
    # Directory prefix of the images; files are addressed as 001.png, 002.png, ...
    imgpath='./huanbu/'
    # range(1, 331) yields page numbers 1 through 330 inclusive.
    for num in range(1, 331):
        # Zero-pad to three digits to build the file name.
        fnum = format_number(num, 3)
        img = f'{imgpath}{fnum}.png'

        extracted_text = extract_text(img)
        # Echo the text to stdout and also record it through the logger.
        print(extracted_text)
        logger.info(extracted_text)
|