Skip to content

Commit 2b08213

Browse files
committed
add highlighting & redacting text in pdf tutorial
1 parent 6dbeccb commit 2b08213

File tree

5 files changed

+320
-0
lines changed

5 files changed

+320
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
9393
- [How to Change Text Color in Python](https://www.thepythoncode.com/article/change-text-color-in-python). ([code](general/printing-in-colors))
9494
- [How to Create a Watchdog in Python](https://www.thepythoncode.com/article/create-a-watchdog-in-python). ([code](general/directory-watcher))
9595
- [How to Watermark PDF Files in Python](https://www.thepythoncode.com/article/watermark-in-pdf-using-python). ([code](general/add-watermark-pdf))
96+
- [Highlighting Text in PDF with Python](https://www.thepythoncode.com/article/redact-and-highlight-text-in-pdf-with-python). ([code](handling-pdf-files/highlight-redact-text))
9697

9798

9899
- ### [Web Scraping](https://www.thepythoncode.com/topic/web-scraping)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# [Highlighting Text in PDF with Python](https://www.thepythoncode.com/article/redact-and-highlight-text-in-pdf-with-python)
2+
To run this:
3+
- `pip3 install -r requirements.txt`
4+
- `python pdf_highlighter.py --help`
6+
**Output:**
7+
```
8+
usage: pdf_highlighter.py [-h] -i INPUT_PATH [-a {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}] [-p PAGES]
9+
10+
Available Options
11+
12+
optional arguments:
13+
-h, --help show this help message and exit
14+
-i INPUT_PATH, --input_path INPUT_PATH
15+
Enter the path of the file or the folder to process
16+
-a {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}, --action {Redact,Frame,Highlight,Squiggly,Underline,Strikeout,Remove}
17+
Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove
18+
-p PAGES, --pages PAGES
19+
Enter the pages to consider e.g.: [2,4]
20+
```
Binary file not shown.
Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
# Import Libraries
2+
from typing import Tuple
3+
from io import BytesIO
4+
import os
5+
import argparse
6+
import re
7+
import fitz
8+
9+
10+
def extract_info(input_file: str):
    """
    Print and return basic information about a PDF file.

    Opens ``input_file`` with PyMuPDF, records whether it is encrypted and,
    when it is not, copies every document metadata entry into the result.
    The collected information is printed to stdout.

    Args:
        input_file: Path of the PDF file to inspect.

    Returns:
        A ``(True, info)`` tuple. ``info`` maps "File" to the input path,
        "Encrypted" to the string "True" or "False" and, for unencrypted
        files, each metadata key to its value.
    """
    # Open the PDF
    pdfDoc = fitz.open(input_file)
    try:
        encrypted = pdfDoc.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        # If PDF is encrypted the file metadata cannot be extracted
        if not encrypted:
            output.update(pdfDoc.metadata)
    finally:
        # Release the document even if reading its metadata fails
        pdfDoc.close()

    # To Display File Info
    print("## File Information ##################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
    print("######################################################################")

    return True, output
30+
31+
32+
def search_for_text(lines, search_str):
    """
    Yield every case-insensitive regex match of ``search_str`` in ``lines``.

    Args:
        lines: Iterable of text lines to scan.
        search_str: Regular expression to look for.

    Yields:
        The full text of each match, in order of appearance. A line that
        contains several matches yields one string per match.
    """
    pattern = re.compile(search_str, re.IGNORECASE)
    for line in lines:
        # finditer + group(0) always gives the whole matched text; findall
        # would return only the capture groups (possibly as tuples) when the
        # pattern contains parentheses.
        for match in pattern.finditer(line):
            yield match.group(0)
42+
43+
44+
def redact_matching_data(page, matched_values):
    """
    Redact every occurrence of the matched values on a page.

    Args:
        page: PyMuPDF page to modify.
        matched_values: Iterable of matched strings; repeated values are
            counted but searched only once.

    Returns:
        The number of values received in ``matched_values``.
    """
    matches_found = 0
    # Values already searched on this page
    searched = set()
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # page.searchFor(val) returns the areas of all occurrences of val and,
        # per the PyMuPDF docs, ignores ASCII case. Searching the same term
        # again would stack duplicate annotations on the same areas.
        # bytes.lower() folds ASCII letters only, mirroring that behaviour.
        key = val.encode("utf-8").lower()
        if key in searched:
            continue
        searched.add(key)
        # Redact matching values
        for area in page.searchFor(val):
            page.addRedactAnnot(area, text=" ", fill=(0, 0, 0))
    # Apply the redaction
    page.apply_redactions()
    return matches_found
59+
60+
61+
def frame_matching_data(page, matched_values):
    """
    Draw a red rectangle around every occurrence of the matched values.

    Args:
        page: PyMuPDF page to annotate.
        matched_values: Iterable of matched strings; repeated values are
            counted but searched only once.

    Returns:
        The number of values received in ``matched_values``.
    """
    matches_found = 0
    # Values already searched on this page
    searched = set()
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # page.searchFor(val) returns the areas of all occurrences of val and,
        # per the PyMuPDF docs, ignores ASCII case. Searching the same term
        # again would draw duplicate frames on the same areas.
        # bytes.lower() folds ASCII letters only, mirroring that behaviour.
        key = val.encode("utf-8").lower()
        if key in searched:
            continue
        searched.add(key)
        for area in page.searchFor(val):
            # fitz.Rect is the public name of the rectangle class
            if not isinstance(area, fitz.Rect):
                continue
            # Draw a rectangle around matched values
            annot = page.addRectAnnot(area)
            annot.setColors(stroke=fitz.utils.getColor('red'))
            annot.update()
    return matches_found
80+
81+
82+
def highlight_matching_data(page, matched_values, type):
    """
    Add a text-marker annotation over every occurrence of the matched values.

    Args:
        page: PyMuPDF page to annotate.
        matched_values: Iterable of matched strings; repeated values are
            counted but searched only once.
        type: Marker kind: 'Highlight', 'Squiggly', 'Underline' or
            'Strikeout'. Any other value falls back to 'Highlight'. (The
            name shadows the builtin but is kept for keyword callers.)

    Returns:
        The number of values received in ``matched_values``.
    """
    # Name of the page method that creates each kind of marker
    method_name = {
        'Highlight': 'addHighlightAnnot',
        'Squiggly': 'addSquigglyAnnot',
        'Underline': 'addUnderlineAnnot',
        'Strikeout': 'addStrikeoutAnnot',
    }.get(type, 'addHighlightAnnot')
    add_marker = getattr(page, method_name)

    matches_found = 0
    # Values already searched on this page
    searched = set()
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # page.searchFor(val) returns the areas of all occurrences of val and,
        # per the PyMuPDF docs, ignores ASCII case. Searching the same term
        # again would stack duplicate markers on the same areas.
        # bytes.lower() folds ASCII letters only, mirroring that behaviour.
        key = val.encode("utf-8").lower()
        if key in searched:
            continue
        searched.add(key)
        matching_val_area = page.searchFor(val)
        if not matching_val_area:
            # The text search located nothing, so there is nothing to mark
            continue
        highlight = add_marker(matching_val_area)
        # Guard against the annotation not having been created
        if highlight is not None:
            # To change the highlight color, call highlight.setColors(...)
            # before update()
            highlight.update()
    return matches_found
109+
110+
111+
def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple = None, action: str = 'Highlight'):
    """
    Search a PDF for a pattern and redact, frame or mark every match.

    The modified document is saved to an in-memory buffer and closed before
    ``output_file`` is opened for writing, so ``output_file`` may be the same
    path as ``input_file``.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the modified PDF is written to.
        search_str: Regular expression to search for (case-insensitive).
        pages: Optional collection of page indices to process, compared
            against the zero-based loop index. Elements may be ints or
            strings. A falsy value processes every page.
        action: 'Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline' or
            'Strikeout'. Any other value is treated as 'Highlight'.
    """
    # Normalise the page filter once so int and str page numbers both match
    wanted_pages = {str(p) for p in pages} if pages else None
    marker_kinds = ('Highlight', 'Squiggly', 'Underline', 'Strikeout')
    # Save the generated PDF to memory buffer
    output_buffer = BytesIO()
    total_matches = 0
    # Open the PDF
    pdfDoc = fitz.open(input_file)
    try:
        # Iterate through pages
        for pg in range(pdfDoc.pageCount):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            # Select the page
            page = pdfDoc[pg]
            # Split page by lines and get the matching data. The result is a
            # generator, so it is consumed by the action below rather than
            # tested for truthiness (a generator is always truthy).
            page_lines = page.getText("text").split('\n')
            matched_values = search_for_text(page_lines, search_str)
            if action == 'Redact':
                matches_found = redact_matching_data(page, matched_values)
            elif action == 'Frame':
                matches_found = frame_matching_data(page, matched_values)
            else:
                kind = action if action in marker_kinds else 'Highlight'
                matches_found = highlight_matching_data(
                    page, matched_values, kind)
            total_matches += matches_found
        print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
        # Save to output
        pdfDoc.save(output_buffer)
    finally:
        # Release the document even if processing or saving fails
        pdfDoc.close()
    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())
151+
152+
153+
def remove_highlght(input_file: str, output_file: str, pages: Tuple = None):
    """
    Delete the annotations found on the selected pages of a PDF.

    The modified document is saved to an in-memory buffer and closed before
    ``output_file`` is opened for writing, so ``output_file`` may be the same
    path as ``input_file``.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the modified PDF is written to.
        pages: Optional collection of page indices to process, compared
            against the zero-based loop index. Elements may be ints or
            strings. A falsy value processes every page.
    """
    # Normalise the page filter once so int and str page numbers both match
    wanted_pages = {str(p) for p in pages} if pages else None
    # Save the generated PDF to memory buffer
    output_buffer = BytesIO()
    # Initialize a counter for annotations
    annot_found = 0
    # Open the PDF
    pdfDoc = fitz.open(input_file)
    try:
        # Iterate through pages
        for pg in range(pdfDoc.pageCount):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            # Select the page
            page = pdfDoc[pg]
            annot = page.firstAnnot
            while annot:
                annot_found += 1
                # Per the PyMuPDF docs, deleting returns the annotation that
                # follows the deleted one. The deleted object must not be
                # used afterwards, so continue from the return value.
                annot = page.deleteAnnot(annot)
        # Report only when something was actually removed
        if annot_found > 0:
            print(f"Annotation(s) Found In The Input File: {input_file}")
        # Save to output
        pdfDoc.save(output_buffer)
    finally:
        # Release the document even if processing or saving fails
        pdfDoc.close()
    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())
181+
182+
183+
184+
def process_file(**kwargs):
    """
    Run the requested action on one PDF file.

    Keyword Args:
        input_file: Path of the PDF to read.
        output_file: Path to write to; when None the input file is
            overwritten.
        search_str: Pattern to search for (not used by "Remove").
        pages: Optional page filter, passed through unchanged.
        action: "Remove" strips annotations; any other value is passed on
            to process_data.
    """
    source = kwargs.get('input_file')
    target = kwargs.get('output_file')
    # Default to writing the result back over the input file
    target = source if target is None else target
    page_filter = kwargs.get('pages')
    # Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
    requested = kwargs.get('action')

    if requested == "Remove":
        # Strip annotations from the file
        remove_highlght(input_file=source, output_file=target,
                        pages=page_filter)
        return

    process_data(input_file=source, output_file=target,
                 search_str=kwargs.get('search_str'),
                 pages=page_filter, action=requested)
205+
206+
207+
def process_folder(**kwargs):
    """
    Run the requested action on every PDF file under a folder.

    Each PDF is handed to process_file with ``output_file=None``.

    Keyword Args:
        input_folder: Folder to scan.
        search_str: Pattern to search for.
        recursive: When falsy, only files directly inside ``input_folder``
            are processed; otherwise sub-folders are walked too.
        action: Redact, Frame, Highlight, Squiggly, Underline, Strikeout
            or Remove.
        pages: Optional page filter, passed through unchanged.
    """
    input_folder = kwargs.get('input_folder')
    search_str = kwargs.get('search_str')
    # Run in recursive mode
    recursive = kwargs.get('recursive')
    # Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
    action = kwargs.get('action')
    pages = kwargs.get('pages')
    # Loop through the files within the input folder.
    for foldername, _dirs, filenames in os.walk(input_folder):
        for filename in filenames:
            # Check if pdf file; compare case-insensitively so that
            # "REPORT.PDF" is not silently skipped
            if not filename.lower().endswith('.pdf'):
                continue
            # PDF File found
            inp_pdf_file = os.path.join(foldername, filename)
            print("Processing file =", inp_pdf_file)
            process_file(input_file=inp_pdf_file, output_file=None,
                         search_str=search_str, action=action, pages=pages)
        # os.walk yields the top folder first, so stopping here limits the
        # run to that folder
        if not recursive:
            break
232+
233+
234+
def is_valid_path(path):
    """
    Return ``path`` unchanged when it names an existing file or folder.

    Raises:
        ValueError: If ``path`` is empty/None, or is neither an existing
            file nor an existing folder.
    """
    if not path:
        raise ValueError("Invalid Path")
    exists_as_file_or_dir = os.path.isfile(path) or os.path.isdir(path)
    if not exists_as_file_or_dir:
        raise ValueError(f"Invalid Path {path}")
    return path
246+
247+
248+
def parse_args():
    """
    Parse and print the command line parameters.

    The set of accepted options depends on the values already given:
    ``-s/--search_str`` is required unless the action is "Remove",
    ``-o/--output_file`` is offered when the input path is a file and
    ``-r/--recursive`` when it is a folder.

    Returns:
        A dict of the parsed arguments.
    """
    def page_numbers(value):
        # Collect every run of digits, so "[2,4]", "2,4" and "12" all work.
        # The previous type=tuple split the argument into single characters
        # and broke multi-digit page numbers. Elements stay strings.
        numbers = tuple(re.findall(r"\d+", value))
        if not numbers:
            raise argparse.ArgumentTypeError(
                f"no page numbers found in {value!r}")
        return numbers

    parser = argparse.ArgumentParser(description="Available Options")
    parser.add_argument('-i', '--input_path', dest='input_path', type=is_valid_path,
                        required=True, help="Enter the path of the file or the folder to process")
    parser.add_argument('-a', '--action', dest='action', choices=['Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline', 'Strikeout', 'Remove'], type=str,
                        default='Highlight', help="Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove")
    parser.add_argument('-p', '--pages', dest='pages', type=page_numbers,
                        help="Enter the pages to consider e.g.: [2,4]")
    # First pass: read the action and input path, ignoring the options that
    # have not been registered yet
    known_args = parser.parse_known_args()[0]
    if known_args.action != 'Remove':
        parser.add_argument('-s', '--search_str', dest='search_str',
                            type=str, required=True, help="Enter a valid search string")
    path = known_args.input_path
    if os.path.isfile(path):
        parser.add_argument('-o', '--output_file', dest='output_file', type=str,
                            help="Enter a valid output file")
    if os.path.isdir(path):
        parser.add_argument('-r', '--recursive', dest='recursive', default=False, type=lambda x: (
            str(x).lower() in ['true', '1', 'yes']), help="Process Recursively or Non-Recursively")
    # Second pass: strict parse with the full option set
    args = vars(parser.parse_args())
    # To Display The Command Line Arguments
    print("## Command Arguments #################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in args.items()))
    print("######################################################################")
    return args
276+
277+
278+
if __name__ == '__main__':
    # Parsing command line arguments entered by user
    args = parse_args()
    target_path = args['input_path']
    # 'search_str' is absent from args when it was not registered as an option
    pattern = args.get('search_str')
    if os.path.isfile(target_path):
        # Single file: show its information, then process it
        extract_info(input_file=target_path)
        process_file(
            input_file=target_path,
            output_file=args['output_file'],
            search_str=pattern,
            pages=args['pages'],
            action=args['action'],
        )
    elif os.path.isdir(target_path):
        # Folder: process the PDF files it contains
        process_folder(
            input_folder=target_path,
            search_str=pattern,
            action=args['action'],
            pages=args['pages'],
            recursive=args['recursive'],
        )
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
PyMuPDF==1.18.9

0 commit comments

Comments
 (0)