Skip to content

Commit 720b1e2

Browse files
committed
Scraping
1 parent 751770a commit 720b1e2

File tree

2 files changed

+24
-49
lines changed

2 files changed

+24
-49
lines changed

README.md

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1911,33 +1911,19 @@ retention=<int>|<datetime.timedelta>|<str>
19111911

19121912
Scraping
19131913
--------
1914+
#### Scrapes and prints Python's URL and version number from Wikipedia:
19141915
```python
19151916
# $ pip3 install requests beautifulsoup4
1916-
>>> import requests
1917-
>>> from bs4 import BeautifulSoup
1918-
>>> url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
1919-
>>> page = requests.get(url)
1920-
>>> doc = BeautifulSoup(page.text, 'html.parser')
1921-
>>> table = doc.find('table', class_='infobox vevent')
1922-
>>> rows = table.find_all('tr')
1923-
>>> link = rows[11].find('a')['href']
1924-
>>> ver = rows[6].find('div').text.split()[0]
1925-
>>> link, ver
1926-
('https://www.python.org/', '3.7.2')
1927-
```
1928-
1929-
### Selenium
1930-
**Library for scraping dynamically generated web content.**
1931-
1932-
```python
1933-
# $ brew cask install chromedriver
1934-
# $ pip3 install selenium
1935-
>>> from selenium import webdriver
1936-
>>> driver = webdriver.Chrome()
1937-
>>> driver.get(url)
1938-
>>> xpath = '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[7]/td/div'
1939-
>>> driver.find_element_by_xpath(xpath).text.split()[0]
1940-
'3.7.2'
1917+
import requests
1918+
from bs4 import BeautifulSoup
1919+
url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
1920+
page = requests.get(url)
1921+
doc = BeautifulSoup(page.text, 'html.parser')
1922+
table = doc.find('table', class_='infobox vevent')
1923+
rows = table.find_all('tr')
1924+
link = rows[11].find('a')['href']
1925+
ver = rows[6].find('div').text.split()[0]
1926+
print(link, ver)
19411927
```
19421928

19431929

@@ -2049,7 +2035,7 @@ from datetime import datetime
20492035
time_str = datetime.now().strftime('%Y%m%d%H%M%S')
20502036
filename = f'profile-{time_str}.png'
20512037
drawer = output.GraphvizOutput(output_file=filename)
2052-
with PyCallGraph(output=drawer):
2038+
with PyCallGraph(drawer):
20532039
<code_to_be_profiled>
20542040
```
20552041

index.html

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1614,29 +1614,18 @@ <h3 id="retention">Retention</h3>
16141614
<li><strong><code class="python hljs"><span class="hljs-string">'&lt;str&gt;'</span></code> - Max age as a string: <code class="python hljs"><span class="hljs-string">'1 week, 3 days'</span></code>, <code class="python hljs"><span class="hljs-string">'2 months'</span></code>, …</strong></li>
16151615
</ul>
16161616
<h2 id="scraping"><a href="#scraping" name="scraping">#</a>Scraping</h2>
1617+
<h4 id="scrapesandprintspythonsurlandversionnumberfromwikipedia">Scrapes and prints Python's URL and version number from Wikipedia:</h4>
16171618
<pre><code class="python language-python hljs"><span class="hljs-comment"># $ pip3 install requests beautifulsoup4</span>
1618-
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> requests
1619-
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> bs4 <span class="hljs-keyword">import</span> BeautifulSoup
1620-
<span class="hljs-meta">&gt;&gt;&gt; </span>url = <span class="hljs-string">'https://en.wikipedia.org/wiki/Python_(programming_language)'</span>
1621-
<span class="hljs-meta">&gt;&gt;&gt; </span>page = requests.get(url)
1622-
<span class="hljs-meta">&gt;&gt;&gt; </span>doc = BeautifulSoup(page.text, <span class="hljs-string">'html.parser'</span>)
1623-
<span class="hljs-meta">&gt;&gt;&gt; </span>table = doc.find(<span class="hljs-string">'table'</span>, class_=<span class="hljs-string">'infobox vevent'</span>)
1624-
<span class="hljs-meta">&gt;&gt;&gt; </span>rows = table.find_all(<span class="hljs-string">'tr'</span>)
1625-
<span class="hljs-meta">&gt;&gt;&gt; </span>link = rows[<span class="hljs-number">11</span>].find(<span class="hljs-string">'a'</span>)[<span class="hljs-string">'href'</span>]
1626-
<span class="hljs-meta">&gt;&gt;&gt; </span>ver = rows[<span class="hljs-number">6</span>].find(<span class="hljs-string">'div'</span>).text.split()[<span class="hljs-number">0</span>]
1627-
<span class="hljs-meta">&gt;&gt;&gt; </span>link, ver
1628-
(<span class="hljs-string">'https://www.python.org/'</span>, <span class="hljs-string">'3.7.2'</span>)
1629-
</code></pre>
1630-
<h3 id="selenium">Selenium</h3>
1631-
<p><strong>Library for scraping dynamically generated web content.</strong></p>
1632-
<pre><code class="python language-python hljs"><span class="hljs-comment"># $ brew cask install chromedriver</span>
1633-
<span class="hljs-comment"># $ pip3 install selenium</span>
1634-
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> selenium <span class="hljs-keyword">import</span> webdriver
1635-
<span class="hljs-meta">&gt;&gt;&gt; </span>driver = webdriver.Chrome()
1636-
<span class="hljs-meta">&gt;&gt;&gt; </span>driver.get(url)
1637-
<span class="hljs-meta">&gt;&gt;&gt; </span>xpath = <span class="hljs-string">'//*[@id="mw-content-text"]/div/table[1]/tbody/tr[7]/td/div'</span>
1638-
<span class="hljs-meta">&gt;&gt;&gt; </span>driver.find_element_by_xpath(xpath).text.split()[<span class="hljs-number">0</span>]
1639-
<span class="hljs-string">'3.7.2'</span>
1619+
<span class="hljs-keyword">import</span> requests
1620+
<span class="hljs-keyword">from</span> bs4 <span class="hljs-keyword">import</span> BeautifulSoup
1621+
url = <span class="hljs-string">'https://en.wikipedia.org/wiki/Python_(programming_language)'</span>
1622+
page = requests.get(url)
1623+
doc = BeautifulSoup(page.text, <span class="hljs-string">'html.parser'</span>)
1624+
table = doc.find(<span class="hljs-string">'table'</span>, class_=<span class="hljs-string">'infobox vevent'</span>)
1625+
rows = table.find_all(<span class="hljs-string">'tr'</span>)
1626+
link = rows[<span class="hljs-number">11</span>].find(<span class="hljs-string">'a'</span>)[<span class="hljs-string">'href'</span>]
1627+
ver = rows[<span class="hljs-number">6</span>].find(<span class="hljs-string">'div'</span>).text.split()[<span class="hljs-number">0</span>]
1628+
print(link, ver)
16401629
</code></pre>
16411630
<h2 id="web"><a href="#web" name="web">#</a>Web</h2>
16421631
<pre><code class="python language-python hljs"><span class="hljs-comment"># $ pip3 install bottle</span>
@@ -1719,7 +1708,7 @@ <h4 id="generatesapngimageofacallgraphwithhighlightedbottlenecks">Generates a PN
17191708
time_str = datetime.now().strftime(<span class="hljs-string">'%Y%m%d%H%M%S'</span>)
17201709
filename = <span class="hljs-string">f'profile-<span class="hljs-subst">{time_str}</span>.png'</span>
17211710
drawer = output.GraphvizOutput(output_file=filename)
1722-
<span class="hljs-keyword">with</span> PyCallGraph(output=drawer):
1711+
<span class="hljs-keyword">with</span> PyCallGraph(drawer):
17231712
&lt;code_to_be_profiled&gt;
17241713
</code></pre>
17251714
<h2 id="numpy"><a href="#numpy" name="numpy">#</a>NumPy</h2>

0 commit comments

Comments
 (0)