@@ -1328,28 +1328,11 @@ from urllib.parse import quote, quote_plus, unquote, unquote_plus
 Scraping
 --------
 ```python
-# $ pip3 install beautifulsoup4
-from http.cookiejar import CookieJar
-from urllib.error import HTTPError, URLError
-from urllib.request import build_opener, HTTPCookieProcessor
-from bs4 import BeautifulSoup
-
-def scrape(url):
-    """Returns tree of HTML elements located at URL."""
-    jar = CookieJar()
-    opener = build_opener(HTTPCookieProcessor(jar))
-    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-    try:
-        html = opener.open(url)
-    except ValueError as error:
-        return print(f'Malformed URL: {url}.\n{error}')
-    except (HTTPError, URLError) as error:
-        return print(f"Can't find URL: {url}.\n{error}")
-    return BeautifulSoup(html, 'html.parser')
-```
-
-```python
->>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
+# $ pip3 install requests beautifulsoup4
+>>> import requests
+>>> from bs4 import BeautifulSoup
+>>> page = requests.get('https://en.wikipedia.org/wiki/Python_(programming_language)')
+>>> document = BeautifulSoup(page.text, 'html.parser')
 
 >>> table = document.find('table', class_='infobox vevent')
 >>> rows = table.find_all('tr')
 >>> website = rows[11].find('a')['href']
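
The new five-line snippet is shorter, but it drops the cookie handling, custom User-Agent header, and error handling that the urllib version had. A minimal sketch of how those could be carried over to requests, assuming a `scrape` helper (its name and the exact exceptions caught are illustrative, not part of this commit):

```python
# $ pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

def scrape(url):
    """Returns tree of HTML elements located at URL, or None on failure."""
    try:
        # A Session persists cookies between requests, like the old CookieJar.
        with requests.Session() as session:
            session.headers['User-Agent'] = 'Mozilla/5.0'
            page = session.get(url)
            page.raise_for_status()  # Turns 4xx/5xx responses into HTTPError.
    except requests.exceptions.MissingSchema as error:
        return print(f'Malformed URL: {url}.\n{error}')
    except requests.exceptions.RequestException as error:
        return print(f"Can't find URL: {url}.\n{error}")
    return BeautifulSoup(page.text, 'html.parser')
```

`requests.exceptions.RequestException` is the base class for connection and HTTP errors, so catching it covers what the old `(HTTPError, URLError)` pair did; `MissingSchema` is handled first because it is itself a `RequestException` subclass.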