|
| 1 | + |
| 2 | +# [ Edited On 16.2.2016 ] |
| 3 | +# On that date this program was working. |
| 4 | + |
# Warning: for Bucky's original typed lines of code, see the file 27_python.py.
| 6 | + |
# Description:
# This file is an alternative solution for the web crawler.
# The main reason is that the website BuckysRoom.com is down, so the original code no longer works.
# Solution description (what this program does):
# This program visits https://www.thenewboston.com/search.php?type=0&sort=reputation, opens every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.
| 12 | + |
| 13 | + |
# But websites change, so sooner or later this program will stop working!
# On the day of its creation this program was working.
| 16 | + |
| 17 | + |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | +import requests |
| 22 | +from bs4 import BeautifulSoup |
| 23 | + |
| 24 | + |
def trade_spider(max_pages):
    """Crawl the thenewboston.com user listing pages.

    Visits pages 1..max_pages of the reputation-sorted user search and,
    for every user link found, prints the profile URL and the user name.

    Args:
        max_pages: highest listing page number to fetch (inclusive).
    """
    page = 1
    while page <= max_pages:
        # Fix: the original URL had '&page==' (a doubled '='), producing a
        # malformed query parameter so pagination never advanced correctly.
        url = ('https://thenewboston.com/search.php'
               '?type=0&sort=reputation&page=' + str(page))
        # just get the page body; redirects disabled so we see the raw response
        source_code = requests.get(url, allow_redirects=False)
        # Replace non-ASCII bytes so BeautifulSoup receives clean input.
        plain_text = source_code.text.encode('ascii', 'replace')
        # BeautifulSoup objects are easy to search through
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = link.get('href')
            title = link.string  # just the anchor text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)  # uncomment to crawl each profile
        page += 1
| 41 | + |
| 42 | + |
def get_single_item_data(item_url):
    """Fetch one user's profile page and print its photo URLs and links.

    Args:
        item_url: absolute URL of the user's profile page.
    """
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # Consistency fix: use the stdlib 'html.parser' as trade_spider does;
    # the original asked for 'lxml', a separate third-party dependency.
    soup = BeautifulSoup(plain_text, 'html.parser')
    # Gather photos: every responsive image on the user's profile page.
    for item_name in soup.findAll('img', {'class': 'img-responsive'}):
        photo = 'https://thenewboston.com' + item_name.get('src')
        print(photo)
    # Gather links: every anchor, usable as the frontier for a wider crawl.
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
| 55 | + |
| 56 | + |
| 57 | +trade_spider(1) |
| 58 | + |
| 59 | + |
| 60 | + |
| 61 | + |
0 commit comments