Skip to content

Commit 3d1516d

Browse files
committed
Scraping
1 parent a31cc71 commit 3d1516d

File tree

1 file changed

+5
-22
lines changed

1 file changed

+5
-22
lines changed

README.md

Lines changed: 5 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1328,28 +1328,11 @@ from urllib.parse import quote, quote_plus, unquote, unquote_plus
1328 1328
Scraping
1329 1329
--------
1330 1330
```python
1331-
# $ pip3 install beautifulsoup4
1332-
from http.cookiejar import CookieJar
1333-
from urllib.error import HTTPError, URLError
1334-
from urllib.request import build_opener, HTTPCookieProcessor
1335-
from bs4 import BeautifulSoup
1336-
1337-
def scrape(url):
1338-
"""Returns tree of HTML elements located at URL."""
1339-
jar = CookieJar()
1340-
opener = build_opener(HTTPCookieProcessor(jar))
1341-
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
1342-
try:
1343-
html = opener.open(url)
1344-
except ValueError as error:
1345-
return print(f'Malformed URL: {url}.\n{error}')
1346-
except (HTTPError, URLError) as error:
1347-
return print(f"Can't find URL: {url}.\n{error}")
1348-
return BeautifulSoup(html, 'html.parser')
1349-
```
1350-
1351-
```python
1352-
>>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
1331+
# $ pip3 install requests beautifulsoup4
1332+
>>> import requests
1333+
>>> from bs4 import BeautifulSoup
1334+
>>> page = requests.get('https://en.wikipedia.org/wiki/Python_(programming_language)')
1335+
>>> document = BeautifulSoup(page.text, 'html.parser')
1353 1336
>>> table = document.find('table', class_='infobox vevent')
1354 1337
>>> rows = table.find_all('tr')
1355 1338
>>> website = rows[11].find('a')['href']

0 commit comments

Comments
 (0)