pyocr测试
安装ocr软件
ubuntu 环境
获取 tesseract 源码: git clone https://github.com/tesseract-ocr/tesseract.git ; 安装编译依赖: apt-get install automake
安装 leptonica http://www.leptonica.org/source/leptonica-1.73.tar.gz
安装 pyocr pip install pyocr
测试
方法: 下载某个网站的验证码, 然后统计识别正确的次数。
总体来说, 大约有三分之一识别正确!
import requests
import shutil
import os
from PIL import Image
import sys
import pyocr
import pyocr.builders
# Pick the OCR backend and the recognition language used for the whole run.
tools = pyocr.get_available_tools()
if not tools:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]

langs = tool.get_available_languages()
if not langs:
    # Without this guard the langs[0] lookup below dies with a bare
    # IndexError that does not say what is actually missing.
    print("No OCR language found")
    sys.exit(1)
print("Available languages: %s" % ", ".join(langs))
# Use the first language the backend reports.
lang = langs[0]
# Number of captcha images to request, and the running tally of OCR results
# that are counted as successful.
total = 10000
count = 0

# HTTP request headers, read from headers.txt as one "Name: value" pair per
# line. Lines without a colon are skipped; only the first colon splits the
# line, so values may themselves contain colons.
headers = {}
with open('headers.txt', 'r') as header_file:  # closed even if parsing fails
    for line in header_file:
        name, sep, value = line.partition(':')
        if not sep:
            continue
        headers[name.strip()] = value.strip()
# Make sure the directory that receives the downloaded captcha images exists.
try:
    os.mkdir('code')
except OSError:
    # An already existing directory is fine; anything else (permissions,
    # a regular file named 'code', ...) must not be silently ignored, or the
    # later open() calls would fail with a far less helpful error.
    if not os.path.isdir('code'):
        raise

# Template for the per-image file name; '{}' is filled with the image index.
path_tmp = os.path.join('code', '{}.jpeg')
# Download `total` captcha images, run OCR on each one and tally the results
# that come back exactly four characters long.
for i in range(total):
    path = path_tmp.format(i)
    r = requests.get('https://plogin.m.jd.com/cgi-bin/m/authcode?mod=login',
                     headers=headers,
                     stream=True)
    try:
        if r.status_code != 200:
            # Failed download: skip this image (it still counts in `total`).
            continue
        with open(path, 'wb') as f:
            # Have the raw stream undo any content encoding before the bytes
            # are written to disk.
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    finally:
        # A streamed response holds its connection until it is closed; release
        # it on every path, including the `continue` above.
        r.close()

    # Close the image file as soon as OCR is done instead of leaving one open
    # handle per iteration.
    with Image.open(path) as image:
        txt = tool.image_to_string(
            image, lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
    print(txt, len(txt))
    # NOTE(review): a result is counted purely on its length (4 characters,
    # presumably the captcha length); the text is never compared to a known
    # answer.
    if len(txt) == 4:
        count += 1

# Report the share of requested images that produced a 4-character result.
# str.format() does not treat '%%' as an escape, so a single '%' is used, and
# the ratio is scaled to an actual percentage.
print("{}% can ocr".format(100.0 * count / total))