Skip to content

Commit 0366310

Browse files
committed
Add solutions to new exercises in section 1
1 parent 1bf1298 commit 0366310

File tree

1 file changed

+60
-33
lines changed

1 file changed

+60
-33
lines changed

ch13-interact-with-pdf-files/1-extract-text-from-a-pdf.py

Lines changed: 60 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,71 @@
22
# Solutions to review exercises
33

44

5-
import os
6-
from PyPDF2 import PdfFileReader, PdfFileWriter
7-
8-
5+
# ***********
96
# Exercise 1
10-
path = "C:/python-basics-exercises/ch13-interact-with-pdf-files/\
11-
practice_files"
7+
#
8+
# In the Chapter 13 Practice Files directory there is a PDF file called
9+
# `zen.pdf`. Create a `PdfFileReader` from this PDF.
10+
# ***********
11+
12+
# Before you can do anything, you need to import the right objects from
13+
# the PyPDF2 and pathlib libraries
14+
from pathlib import Path
15+
from PyPDF2 import PdfFileReader
1216

13-
input_file_path = os.path.join(path, "The Whistling Gypsy.pdf")
14-
input_file = PdfFileReader(input_file_path)
17+
# To create a PdfFileReader instance, you need the path to the PDF file.
18+
# We'll assume you downloaded the solutions folder and extracted it into
19+
# the home directory on your computer. If this is not the case, you'll
20+
# need to update the path below.
21+
pdf_path = Path.home() / "python-basics-exercises/ch13-interact-with-pdf-files" \
22+
"/practice_files/zen.pdf"
1523

16-
# Display meta-data about file
17-
print("Title:", input_file.getDocumentInfo().title)
18-
print("Author:", input_file.getDocumentInfo().author)
19-
print("Number of pages:", input_file.getNumPages())
24+
# Now you can create the PdfFileReader instance. Remember that
25+
# PdfFileReader objects can only be instantiated with path strings, not
26+
# Path objects!
27+
pdf_reader = PdfFileReader(str(pdf_path))
2028

2129

30+
# ***********
2231
# Exercise 2
23-
# Specify and open output text file
24-
output_file_path = os.path.join(path, "Output/The Whistling Gypsy.txt")
25-
with open(output_file_path, "w") as output_file:
26-
# Extract every page of text
27-
for page_num in range(0, input_file.getNumPages()):
28-
text = input_file.getPage(page_num).extractText()
29-
output_file.write(text)
30-
31-
# NOTE: On some machines, you may get a UnicodeDecodeError when
32-
# writing the file. To fix this, replace line 25 with the following:
33-
# with open(output_file_path, "w", encoding="utf-8")
32+
#
33+
# Using the `PdfFileReader` instance from Exercise 1, print the total
34+
# number of pages in the PDF.
35+
# ***********
36+
37+
# Use .getNumPages() to get the number of pages, then print the result
38+
# using the print() built-in
39+
num_pages = pdf_reader.getNumPages()
40+
print(num_pages)
41+
3442

43+
# ***********
3544
# Exercise 3
36-
# Save file without cover page
37-
output_PDF = PdfFileWriter()
38-
for page_num in range(1, input_file.getNumPages()):
39-
output_PDF.addPage(input_file.getPage(page_num))
40-
41-
output_file_name = os.path.join(
42-
path, "Output/The Whistling Gypsy un-covered.pdf"
43-
)
44-
with open(output_file_name, "wb") as output_file:
45-
output_PDF.write(output_file)
45+
#
46+
# Print the text from the first page of the PDF in Exercise 1.
47+
# ***********
48+
49+
# Use .getPage() to get the first page. Remember pages are indexed
50+
# starting with 0!
51+
first_page = pdf_reader.getPage(0)
52+
53+
# Then use .extractText() to extract the text
54+
text = first_page.extractText()
55+
56+
# Finally, print the text
57+
print(text)
58+
59+
60+
# **NOTE**: The text in zen.pdf is from "The Zen of Python" written by
61+
# Tim Peters in 1999. The Zen is a collection of 19 guiding principles
62+
# for developing with Python. The story goes that there are actually 20
63+
# such principles, but only 19 were written down!
64+
#
65+
# You can see the original submission for The Zen of Python in PEP 20:
66+
# https://www.python.org/dev/peps/pep-0020/
67+
#
68+
# For some historical context surrounding The Zen, see:
69+
# https://mail.python.org/pipermail/python-list/1999-June/001951.html
70+
#
71+
# Author Al Sweigart has an interpretation of The Zen on his blog:
72+
# https://inventwithpython.com/blog/2018/08/17/the-zen-of-python-explained/

0 commit comments

Comments
 (0)