add highlighting & redacting text in pdf tutorial

x4nth055 · x4nth055 · commit 2b082138c30d · 2021-07-23T00:08:28.000+01:00
diff --git a/README.md b/README.md
@@ -93,6 +93,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
     - [How to Change Text Color in Python](https://www.thepythoncode.com/article/change-text-color-in-python). ([code](general/printing-in-colors))
     - [How to Create a Watchdog in Python](https://www.thepythoncode.com/article/create-a-watchdog-in-python). ([code](general/directory-watcher))
     - [How to Watermark PDF Files in Python](https://www.thepythoncode.com/article/watermark-in-pdf-using-python). ([code](general/add-watermark-pdf))
+    - [Highlighting Text in PDF with Python](https://www.thepythoncode.com/article/redact-and-highlight-text-in-pdf-with-python). ([code](handling-pdf-files/highlight-redact-text))
     
     
 - ### [Web Scraping](https://www.thepythoncode.com/topic/web-scraping)
diff --git a/handling-pdf-files/highlight-redact-text/README.md b/handling-pdf-files/highlight-redact-text/README.md
@@ -0,0 +1,20 @@
+# [Highlighting Text in PDF with Python](https://www.thepythoncode.com/article/redact-and-highlight-text-in-pdf-with-python)
+To run this:
+- `pip3 install -r requirements.txt`
+- 
+    ```python pdf_highlighter.py --help```
+    **Output:**
+    ```
+    usage: pdf_highlighter.py [-h] -i INPUT_PATH [-a {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}] [-p PAGES]
+
+    Available Options
+
+    optional arguments:
+    -h, --help            show this help message and exit
+    -i INPUT_PATH, --input_path INPUT_PATH
+                            Enter the path of the file or the folder to process
+    -a {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}, --action {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}
+                            Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove
+    -p PAGES, --pages PAGES
+                            Enter the pages to consider e.g.: [2,4]
+    ```
diff --git a/handling-pdf-files/highlight-redact-text/bert-paper.pdf b/handling-pdf-files/highlight-redact-text/bert-paper.pdf
diff --git a/handling-pdf-files/highlight-redact-text/pdf_highlighter.py b/handling-pdf-files/highlight-redact-text/pdf_highlighter.py
@@ -0,0 +1,298 @@
+# Import Libraries
+from typing import Tuple
+from io import BytesIO
+import os
+import argparse
+import re
+import fitz
+
+
+def extract_info(input_file: str):
+    """
+    Extracts file info
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    output = {
+        "File": input_file, "Encrypted": ("True" if pdfDoc.isEncrypted else "False")
+    }
+    # If PDF is encrypted the file metadata cannot be extracted
+    if not pdfDoc.isEncrypted:
+        for key, value in pdfDoc.metadata.items():
+            output[key] = value
+
+    # To Display File Info
+    print("## File Information ##################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
+    print("######################################################################")
+
+    return True, output
+
+
+def search_for_text(lines, search_str):
+    """
+    Search for the search string within the document lines
+    """
+    for line in lines:
+        # Find all matches within one line
+        results = re.findall(search_str, line, re.IGNORECASE)
+        # In case multiple matches within one line
+        for result in results:
+            yield result
+
+
+def redact_matching_data(page, matched_values):
+    """
+    Redacts matching values
+    """
+    matches_found = 0
+    # Loop throughout matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.searchFor(val)
+        # Redact matching values
+        [page.addRedactAnnot(area, text=" ", fill=(0, 0, 0))
+         for area in matching_val_area]
+    # Apply the redaction
+    page.apply_redactions()
+    return matches_found
+
+
+def frame_matching_data(page, matched_values):
+    """
+    frames matching values
+    """
+    matches_found = 0
+    # Loop throughout matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.searchFor(val)
+        for area in matching_val_area:
+            if isinstance(area, fitz.fitz.Rect):
+                # Draw a rectangle around matched values
+                annot = page.addRectAnnot(area)
+                # , fill = fitz.utils.getColor('black')
+                annot.setColors(stroke=fitz.utils.getColor('red'))
+                # If you want to remove matched data
+                #page.addFreetextAnnot(area, ' ')
+                annot.update()
+    return matches_found
+
+
+def highlight_matching_data(page, matched_values, type):
+    """
+    Highlight matching values
+    """
+    matches_found = 0
+    # Loop throughout matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.searchFor(val)
+        # print("matching_val_area",matching_val_area)
+        highlight = None
+        if type == 'Highlight':
+            highlight = page.addHighlightAnnot(matching_val_area)
+        elif type == 'Squiggly':
+            highlight = page.addSquigglyAnnot(matching_val_area)
+        elif type == 'Underline':
+            highlight = page.addUnderlineAnnot(matching_val_area)
+        elif type == 'Strikeout':
+            highlight = page.addStrikeoutAnnot(matching_val_area)
+        else:
+            highlight = page.addHighlightAnnot(matching_val_area)
+        # To change the highlight colar
+        # highlight.setColors({"stroke":(0,0,1),"fill":(0.75,0.8,0.95) })
+        # highlight.setColors(stroke = fitz.utils.getColor('white'), fill = fitz.utils.getColor('red'))
+        # highlight.setColors(colors= fitz.utils.getColor('red'))
+        highlight.update()
+    return matches_found
+
+
+def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple = None, action: str = 'Highlight'):
+    """
+    Process the pages of the PDF File
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # Save the generated PDF to memory buffer
+    output_buffer = BytesIO()
+    total_matches = 0
+    # Iterate through pages
+    for pg in range(pdfDoc.pageCount):
+        # If required for specific pages
+        if pages:
+            if str(pg) not in pages:
+                continue
+        # Select the page
+        page = pdfDoc[pg]
+        # Get Matching Data
+        # Split page by lines
+        page_lines = page.getText("text").split('\n')
+        matched_values = search_for_text(page_lines, search_str)
+        if matched_values:
+            if action == 'Redact':
+                matches_found = redact_matching_data(page, matched_values)
+            elif action == 'Frame':
+                matches_found = frame_matching_data(page, matched_values)
+            elif action in ('Highlight', 'Squiggly', 'Underline', 'Strikeout'):
+                matches_found = highlight_matching_data(
+                    page, matched_values, action)
+            else:
+                matches_found = highlight_matching_data(
+                    page, matched_values, 'Highlight')
+            total_matches += matches_found
+    print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
+    # Save to output
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    # Save the output buffer to the output file
+    with open(output_file, mode='wb') as f:
+        f.write(output_buffer.getbuffer())
+
+
+def remove_highlght(input_file: str, output_file: str, pages: Tuple = None):
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # Save the generated PDF to memory buffer
+    output_buffer = BytesIO()
+    # Initialize a counter for annotations
+    annot_found = 0
+    # Iterate through pages
+    for pg in range(pdfDoc.pageCount):
+        # If required for specific pages
+        if pages:
+            if str(pg) not in pages:
+                continue
+        # Select the page
+        page = pdfDoc[pg]
+        annot = page.firstAnnot
+        while annot:
+            annot_found += 1
+            page.deleteAnnot(annot)
+            annot = annot.next
+    if annot_found >= 0:
+        print(f"Annotation(s) Found In The Input File: {input_file}")
+    # Save to output
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    # Save the output buffer to the output file
+    with open(output_file, mode='wb') as f:
+        f.write(output_buffer.getbuffer())
+
+
+
+def process_file(**kwargs):
+    """
+    To process one single file
+    Redact, Frame, Highlight... one PDF File
+    Remove Highlights from a single PDF File
+    """
+    input_file = kwargs.get('input_file')
+    output_file = kwargs.get('output_file')
+    if output_file is None:
+        output_file = input_file
+    search_str = kwargs.get('search_str')
+    pages = kwargs.get('pages')
+    # Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
+    action = kwargs.get('action')
+    if action == "Remove":
+        # Remove the Highlights except Redactions
+        remove_highlght(input_file=input_file,
+                        output_file=output_file, pages=pages)
+    else:
+        process_data(input_file=input_file, output_file=output_file,
+                     search_str=search_str, pages=pages, action=action)
+
+
+def process_folder(**kwargs):
+    """
+    Redact, Frame, Highlight... all PDF Files within a specified path
+    Remove Highlights from all PDF Files within a specified path
+    """
+    input_folder = kwargs.get('input_folder')
+    search_str = kwargs.get('search_str')
+    # Run in recursive mode
+    recursive = kwargs.get('recursive')
+    #Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
+    action = kwargs.get('action')
+    pages = kwargs.get('pages')
+    # Loop though the files within the input folder.
+    for foldername, dirs, filenames in os.walk(input_folder):
+        for filename in filenames:
+            # Check if pdf file
+            if not filename.endswith('.pdf'):
+                continue
+             # PDF File found
+            inp_pdf_file = os.path.join(foldername, filename)
+            print("Processing file =", inp_pdf_file)
+            process_file(input_file=inp_pdf_file, output_file=None,
+                         search_str=search_str, action=action, pages=pages)
+        if not recursive:
+            break
+
+
+def is_valid_path(path):
+    """
+    Validates the path inputted and checks whether it is a file path or a folder path
+    """
+    if not path:
+        raise ValueError(f"Invalid Path")
+    if os.path.isfile(path):
+        return path
+    elif os.path.isdir(path):
+        return path
+    else:
+        raise ValueError(f"Invalid Path {path}")
+
+
+def parse_args():
+    """
+    Get user command line parameters
+    """
+    parser = argparse.ArgumentParser(description="Available Options")
+    parser.add_argument('-i', '--input_path', dest='input_path', type=is_valid_path,
+                        required=True, help="Enter the path of the file or the folder to process")
+    parser.add_argument('-a', '--action', dest='action', choices=['Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline', 'Strikeout', 'Remove'], type=str,
+                        default='Highlight', help="Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove")
+    parser.add_argument('-p', '--pages', dest='pages', type=tuple,
+                        help="Enter the pages to consider e.g.: [2,4]")
+    action = parser.parse_known_args()[0].action
+    if action != 'Remove':
+        parser.add_argument('-s', '--search_str', dest='search_str'                            # lambda x: os.path.has_valid_dir_syntax(x)
+                            , type=str, required=True, help="Enter a valid search string")
+    path = parser.parse_known_args()[0].input_path
+    if os.path.isfile(path):
+        parser.add_argument('-o', '--output_file', dest='output_file', type=str  # lambda x: os.path.has_valid_dir_syntax(x)
+                            , help="Enter a valid output file")
+    if os.path.isdir(path):
+        parser.add_argument('-r', '--recursive', dest='recursive', default=False, type=lambda x: (
+            str(x).lower() in ['true', '1', 'yes']), help="Process Recursively or Non-Recursively")
+    args = vars(parser.parse_args())
+    # To Display The Command Line Arguments
+    print("## Command Arguments #################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in args.items()))
+    print("######################################################################")
+    return args
+
+
+if __name__ == '__main__':
+    # Parsing command line arguments entered by user
+    args = parse_args()
+    # If File Path
+    if os.path.isfile(args['input_path']):
+        # Extracting File Info
+        extract_info(input_file=args['input_path'])
+        # Process a file
+        process_file(
+            input_file=args['input_path'], output_file=args['output_file'], 
+            search_str=args['search_str'] if 'search_str' in (args.keys()) else None, 
+            pages=args['pages'], action=args['action']
+        )
+    # If Folder Path
+    elif os.path.isdir(args['input_path']):
+        # Process a folder
+        process_folder(
+            input_folder=args['input_path'], 
+            search_str=args['search_str'] if 'search_str' in (args.keys()) else None, 
+            action=args['action'], pages=args['pages'], recursive=args['recursive']
+        )
diff --git a/handling-pdf-files/highlight-redact-text/requirements.txt b/handling-pdf-files/highlight-redact-text/requirements.txt
@@ -0,0 +1 @@
+PyMuPDF==1.18.9