|
| 1 | +# [ Edited On 16.2.2016 ] |
| 2 | +# On that date this program was working. |
| 3 | + |
| 4 | +#Warning: For Bucky's original code as typed in the video, see the file 26_python.py. |
| 5 | + |
| 6 | +#Description: |
| 7 | +#This file is an alternative solution for the web crawler. |
| 8 | +# The major reason for this is that the website BuckysRoom.com is down, so the original code does not work anymore. |
| 9 | +# |
| 10 | +# Solution description (what this program actually does - not the same as in the video): |
| 11 | +#This program goes to the website https://www.thenewboston.com/search.php?type=0&sort=reputation , |
| 12 | +#visits every user's profile, and on that profile |
| 13 | +#prints the first few (approx. 20) links to the latest photos. To view a photo, click on the url or copy it into a web browser. |
| 14 | + |
| 15 | + |
| 16 | +# But websites change over time, and sooner or later this program will stop working. |
| 17 | +# On the day it was created, this program was working. |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | + |
| 22 | + |
| 23 | + |
| 24 | +import requests |
| 25 | +from bs4 import BeautifulSoup |
| 26 | + |
| 27 | + |
def trade_spider(max_pages):
    """Print the link and name of every user listed on the search pages.

    Requests result pages 1 through ``max_pages`` of the thenewboston.com
    user search (sorted by reputation). For each ``<a class="user-name">``
    anchor found on a page, prints its ``href`` attribute and then its text.

    Args:
        max_pages: Number of the last result page to fetch (an integer).
            Nothing is fetched when it is less than 1.
    """
    for page in range(1, max_pages + 1):
        # The query string must use a single '=' so the server receives
        # page=N; 'page==N' would send the value '=N' instead.
        url = 'https://thenewboston.com/search.php?type=0&sort=reputation&page=' + str(page)
        # allow_redirects=False: a redirect response is parsed as-is
        # instead of being followed.
        response = requests.get(url, allow_redirects=False)
        # Non-ASCII characters are replaced with '?' before parsing.
        plain_text = response.text.encode('ascii', 'replace')
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'user-name'}):
            print(link.get('href'))
            print(link.string)
| 41 | + |
# Crawl only the first page of search results.
trade_spider(1)
0 commit comments