Skip to content

PDF Loader


The RapidOCRPDFLoader class extends UnstructuredFileLoader and is designed to load and process PDF files using Optical Character Recognition (OCR).

Input parameters:

file_path: The path of the PDF file to be loaded.

Methods:

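_get_elements: Extracts the text embedded in the PDF directly and runs OCR on the images embedded in each page. It iterates through every page, collects the page text and the OCR output of the page's images, replaces line breaks with spaces, and passes the combined text to unstructured's partition_text to produce the structured elements.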

pdftext (nested function): Handles the actual per-page text extraction and image OCR for the PDF, and returns the combined text as a single string.

Return value:

Returns a list of structured text documents after processing the PDF.

python
class RapidOCRPDFLoader(UnstructuredFileLoader):
    """Load a PDF by combining its embedded text with OCR of its embedded images."""

    def _get_elements(self) -> List:
        """Extract the text of ``self.file_path`` and partition it into elements.

        Returns:
            The list produced by ``unstructured``'s ``partition_text`` for the
            combined page text and OCR output, with line breaks replaced by
            spaces.
        """

        def pdftext(filepath):
            """Return the page text plus the OCR text of every embedded image.

            Args:
                filepath: Path of the PDF file to open with ``fitz``.

            Returns:
                A single string; each page's text and each image's OCR output
                is terminated by a newline so adjacent pieces never fuse.
            """
            ocr = RapidOCR()
            # Collect pieces in a list and join once instead of repeated `+=`.
            pieces = []

            # Context managers make sure the document handle and the progress
            # bar are released even if text extraction or OCR raises.
            with fitz.open(filepath) as doc:
                with tqdm.tqdm(
                    total=doc.page_count,
                    desc="RapidOCRPDFLoader context page index: 0",
                ) as unit:
                    for i, page in enumerate(doc):
                        unit.set_description(
                            "RapidOCRPDFLoader context page index: {}".format(i)
                        )
                        # Immediately display the progress bar update
                        unit.refresh()

                        pieces.append(page.get_text("") + "\n")

                        for img in page.get_images():
                            # img[0] is handed to fitz.Pixmap to load the image
                            # (presumably its xref -- confirm against PyMuPDF docs).
                            pix = fitz.Pixmap(doc, img[0])
                            # View the raw samples as a (height, width, channels) array.
                            img_array = np.frombuffer(
                                pix.samples, dtype=np.uint8
                            ).reshape(pix.height, pix.width, -1)
                            result, _ = ocr(img_array)
                            if result:
                                # line[1] is presumably the recognized text of
                                # each detected line -- confirm against RapidOCR.
                                ocr_lines = [line[1] for line in result]
                                # Trailing newline keeps this image's last line
                                # from being glued to whatever text comes next.
                                pieces.append("\n".join(ocr_lines) + "\n")

                        # update progress
                        unit.update(1)
            return "".join(pieces)

        text = pdftext(self.file_path)
        text = text.replace('\n', ' ')  # Remove the line breaks
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)


if __name__ == "__main__":
    # Demo entry point: replace ... with the absolute path of the PDF to load.
    pdf_loader = RapidOCRPDFLoader(file_path=r"...")
    # Load the file and print the resulting documents.
    loaded_docs = pdf_loader.load()
    print(loaded_docs)

Developed by XJTLU-Software 2024