add converting pdf to image tutorial

x4nth055 · x4nth055 · commit bb354199adc3 · 2021-07-31T13:53:10.000+01:00
diff --git a/README.md b/README.md
@@ -96,6 +96,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
     - [Highlighting Text in PDF with Python](https://www.thepythoncode.com/article/redact-and-highlight-text-in-pdf-with-python). ([code](handling-pdf-files/highlight-redact-text))
     - [How to Extract Text from Images in PDF Files with Python](https://www.thepythoncode.com/article/extract-text-from-images-or-scanned-pdf-python). ([code](handling-pdf-files/pdf-ocr))
     - [How to Convert PDF to Docx in Python](https://www.thepythoncode.com/article/convert-pdf-files-to-docx-in-python). ([code](handling-pdf-files/convert-pdf-to-docx))
+    - [How to Convert PDF to Images in Python](https://www.thepythoncode.com/article/convert-pdf-files-to-images-in-python). ([code](handling-pdf-files/convert-pdf-to-image))
     
     
 - ### [Web Scraping](https://www.thepythoncode.com/topic/web-scraping)
diff --git a/handling-pdf-files/convert-pdf-to-image/README.md b/handling-pdf-files/convert-pdf-to-image/README.md
@@ -0,0 +1,7 @@
+# [How to Convert PDF to Images in Python](https://www.thepythoncode.com/article/convert-pdf-files-to-images-in-python)
+To run this:
+- `pip3 install -r requirements.txt`
+- To convert the PDF file `bert-paper.pdf` into several images (one image per page):
+    ```
+    $ python convert_pdf2image.py bert-paper.pdf
+    ```
diff --git a/handling-pdf-files/convert-pdf-to-image/bert-paper.pdf b/handling-pdf-files/convert-pdf-to-image/bert-paper.pdf
diff --git a/handling-pdf-files/convert-pdf-to-image/convert_pdf2image.py b/handling-pdf-files/convert-pdf-to-image/convert_pdf2image.py
@@ -0,0 +1,48 @@
+import fitz
+
+from typing import Tuple
+import os
+
+
+def convert_pdf2img(input_file: str, pages: Tuple = None):
+    """Converts pdf to image and generates a file by page"""
+    # Open the document
+    pdfIn = fitz.open(input_file)
+    output_files = []
+    # Iterate throughout the pages
+    for pg in range(pdfIn.pageCount):
+        if str(pages) != str(None):
+            if str(pg) not in str(pages):
+                continue
+        # Select a page
+        page = pdfIn[pg]
+        rotate = int(0)
+        # PDF Page is converted into a whole picture 1056*816 and then for each picture a screenshot is taken.
+        # zoom = 1.33333333 -----> Image size = 1056*816
+        # zoom = 2 ---> 2 * Default Resolution (text is clear, image text is hard to read)    = filesize small / Image size = 1584*1224
+        # zoom = 4 ---> 4 * Default Resolution (text is clear, image text is barely readable) = filesize large
+        # zoom = 8 ---> 8 * Default Resolution (text is clear, image text is readable) = filesize large
+        zoom_x = 2
+        zoom_y = 2
+        # The zoom factor is equal to 2 in order to make text clear
+        # Pre-rotate is to rotate if needed.
+        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
+        pix = page.getPixmap(matrix=mat, alpha=False)
+        output_file = f"{os.path.splitext(os.path.basename(input_file))[0]}_page{pg+1}.png"
+        pix.writePNG(output_file)
+        output_files.append(output_file)
+    pdfIn.close()
+    summary = {
+        "File": input_file, "Pages": str(pages), "Output File(s)": str(output_files)
+    }
+    # Printing Summary
+    print("## Summary ########################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in summary.items()))
+    print("###################################################################")
+    return output_files
+
+
+if __name__ == "__main__":
+    import sys
+    input_file = sys.argv[1]
+    convert_pdf2img(input_file)
diff --git a/handling-pdf-files/convert-pdf-to-image/requirements.txt b/handling-pdf-files/convert-pdf-to-image/requirements.txt
@@ -0,0 +1 @@
+PyMuPDF==1.18.9