Skip to content

Commit aa12fe6

Browse files
committed
weslmusek@gmail.com
1 parent 2bfc1e3 commit aa12fe6

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

class_html_parsing.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import re
2+
from bs4 import BeautifulSoup
3+
4+
ITEM_HTML = '''<!DOCTYPE html>
5+
<html lang="en" dir="ltr">
6+
<head>
7+
<meta charset="utf-8">
8+
<title></title>
9+
</head>
10+
<body>
11+
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
12+
13+
<article class="product_pod">
14+
15+
<div class="image_container">
16+
17+
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
18+
</div>
19+
20+
<p class="star-rating Three">
21+
<i class="icon-star"></i>
22+
<i class="icon-star"></i>
23+
<i class="icon-star"></i>
24+
<i class="icon-star"></i>
25+
<i class="icon-star"></i>
26+
</p>
27+
28+
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
29+
30+
<div class="product_price">
31+
32+
<p class="price_color">£51.77</p>
33+
34+
<p class="instock availability">
35+
<i class="icon-ok"></i>
36+
37+
In stock
38+
</p>
39+
40+
<form>
41+
<button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
42+
</form>
43+
</div>
44+
</article>
45+
</li>
46+
</body>
47+
</html>
48+
'''
49+
50+
# Parse the sample markup once at module load; the find_item_* functions
# below all query this shared module-level soup.
soup = BeautifulSoup(ITEM_HTML, 'html.parser')
51+
52+
53+
def find_item_name():
    """Print the item's full name, taken from its heading link's title attribute.

    The link's visible text is abbreviated in the sample markup
    ("A Light in the ..."), so the ``title`` attribute is read instead.
    """
    # CSS locator: the anchor inside the product pod's <h3>.
    name_anchor = soup.select_one('article.product_pod h3 a')
    print(name_anchor.attrs['title'])
58+
59+
60+
def find_item_link():
    """Print the href of the item's heading link.

    The value is printed exactly as written in the markup, which for the
    sample item is a relative link.
    """
    # CSS locator: the anchor inside the product pod's <h3>.
    heading_anchor = soup.select_one('article.product_pod h3 a')
    href = heading_anchor.attrs['href']
    print(href)
64+
65+
66+
'''
67+
def find_item_price():
68+
# locator = 'article.product_pod p.price_color'
69+
# item_link = soup.select_one(locator)
70+
prod_price = soup.find('p', {'class': 'price_color'}).string
71+
print(prod_price[1:]) # this extracts a string, not a number
72+
'''
73+
74+
75+
def find_item_price():
    """Print the item's price twice: as displayed, then as a float.

    Reads the text of the ``p.price_color`` element inside the product pod,
    matches a pound sign followed by a decimal number, and prints the whole
    match followed by the numeric part converted with ``float``.

    No guard is applied: if the element is missing or its text does not
    match the pattern, the resulting exception propagates to the caller.
    """
    locator = 'article.product_pod p.price_color'
    item_price = soup.select_one(locator).string

    # Raw string: in a plain literal '\.' is an invalid escape sequence and
    # triggers a warning on modern Python; the regex itself is unchanged.
    pattern = r'£([0-9]+\.[0-9]+)'
    matcher = re.search(pattern, item_price)
    print(matcher.group(0))  # £51.77
    print(float(matcher.group(1)))  # 51.77
83+
84+
85+
def find_item_rating():
    """Print the rating word carried in the star-rating paragraph's classes.

    The paragraph's class attribute holds the ``star-rating`` marker plus the
    rating itself (``Three`` in the sample markup). Every class other than the
    marker is kept, and the first one left is printed.
    """
    rating_tag = soup.select_one('article.product_pod p.star-rating')
    remaining = []
    for css_class in rating_tag.attrs['class']:
        if css_class != 'star-rating':
            remaining.append(css_class)
    print(remaining[0])
92+
93+
94+
# Demo run: call each finder once; every one prints its result to stdout.
find_item_name()
95+
find_item_link()
96+
find_item_price()
97+
find_item_rating()

middle_html.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@
4747
</html>
4848
'''
4949

50+
51+
class ParsedItem:
52+
'''
53+
Takes in an HTML page, or part of one, and finds properties of the items in it.
54+
'''
55+
56+
5057
# Module-level parse of ITEM_HTML using the 'html.parser' builder.
soup = BeautifulSoup(ITEM_HTML, 'html.parser')
5158

5259

0 commit comments

Comments
 (0)