I've found some guides online on how to make a PDF searchable if it was scanned. However, I'm currently struggling with figuring out how to do it for a multipage PDF.
My code takes multipaged PDFs, converts each page into a JPG, runs OCR on each page and then converts it into a PDF. However, only the last page is returned.
import pytesseract
from pdf2image import convert_from_path
pytesseract.pytesseract.tesseract_cmd = 'directory'
TESSDATA_PREFIX = 'directory'
tessdata_dir_config = '--tessdata-dir directory'
# Path of the pdf
PDF_file = r"pdf directory"
  
  
def pdf_text():
    
    # Store all the pages of the PDF in a variable
    pages = convert_from_path(PDF_file, 500)
  
    image_counter = 1
    for page in pages:
        # Declare file names
        filename = "page_"+str(image_counter)+".jpg"
        # Save the image of the page in system
        page.save(filename, 'JPEG')
        # Increment the counter to update filename
        image_counter = image_counter + 1
    # Variable to get count of total number of pages
    filelimit = image_counter-1
    outfile = "out_text.pdf"
    # Open the file in append mode so that all contents of all images are added to the same file
    
    f = open(outfile, "a")
    # Iterate from 1 to total number of pages
    for i in range(1, filelimit + 1):
        filename = "page_"+str(i)+".jpg"
        # Recognize the text as string in image using pytesseract
        result =  pytesseract.image_to_pdf_or_hocr(filename, lang="eng", config=tessdata_dir_config) 
            
        f = open(outfile, "w+b")
        f.write(bytearray(result))
        f.close()
pdf_text()
How can I run this for all pages and output one merged PDF?
 
     
    