Just for documentation reasons, here is an example of OCR
using tesseract
and pdf2image
to extract text from an image pdf.
import pdf2image
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
def pdf_to_img(pdf_file):
return pdf2image.convert_from_path(pdf_file)
def ocr_core(file):
text = pytesseract.image_to_string(file)
return text
def print_pages(pdf_file):
images = pdf_to_img(pdf_file)
for pg, img in enumerate(images):
print(ocr_core(img))
print_pages('sample.pdf')