Commit 94d65bc
Remove broken links (public-api-lists#16)
* Add broken-links.md. This file contains the list of broken links removed from readme.md. Some links may be temporarily down or may have moved, so we can investigate them and restore working versions in the future.

* Add an automated broken-links collector. To use it, download readme.md (raw file) and broken_link_finder.py and execute them on your machine. The program searches the entire readme (raw file) and collects the broken links in the error.txt file. Some webpages return an error to the script because of bot-protection features but work fine when tried manually in a browser, so a manual check is needed only on the filtered links saved in error.txt. The program thus narrows down the search for broken links.

* Remove broken links. There are a total of 70 links that don't work, whether temporarily or permanently; some pages may have moved. They were removed from readme.md and saved in broken-links.md for future reference.
1 parent 21478e6 commit 94d65bc
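
For reference, the core of the check is simple: fetch each URL and treat any 4xx/5xx response, or a connection failure, as a broken link. Below is a minimal standalone sketch of that idea; the function name, the `timeout` value, and the example URL are assumptions for illustration, not part of the committed script.

import requests

def looks_broken(url):
    # Sketch only: treat 4xx/5xx status codes and unreachable hosts as broken.
    try:
        return requests.get(url, timeout=10).status_code >= 400
    except requests.exceptions.RequestException:
        return True

print(looks_broken('https://example.com'))  # False while the page is up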

File tree

3 files changed: +194 / -70 lines

.github/broken-link-collectorr.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
"""
Note:
This program filters out most of the good links and collects the links that
returned an error, saving them in a separate file.
Some links may work in a browser but not in Python because of the webpage's
security features, so a manual check on those filtered records is needed.
Since the program has already filtered out most of the good links, checking
the remaining links is quick.
The program takes a while to run, depending on internet speed.

Instructions:
Download the bad_link_filter script and the readme as raw files,
then execute the script on your machine.
The bad links will be saved in the error.txt file.
Then manually check the links mentioned in error.txt and remove the good
links from that file.
"""

import requests


def is_url_working(url):  # Check the status code of the webpage
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'keep-alive',
            'Access-Control-Allow-Methods': 'POST',
            'Access-Control-Allow-Headers': 'X-PINGOTHER, Content-Type',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }
        proxies = {"http": None, "https": None}
        response = requests.get(url, headers=headers, proxies=proxies)
        status = response.status_code
        if status >= 400:  # 4xx/5xx responses are treated as broken links
            return status
    except requests.exceptions.ConnectionError:
        return 'HTTPSConnectionPool error'
    except Exception as e:
        return e


def collect_error_links(indexes):  # Check every link, section by section
    error_links = []
    print('In progress; completed sections will be shown below. Please wait a while.')
    for index, section in indexes.items():
        for title, row in section.items():
            error = is_url_working(row['link'])
            if error:
                error_links.append({
                    'index': index,
                    'title': title,
                    'link': row['link'],
                    'error': error,
                })
        print(index, 'section completed')
    return error_links


def get_lines_from_file(location):  # Open and read the file, dropping empty lines and surrounding spaces
    with open(location, 'r') as file:
        lines = [line.strip() for line in file.readlines() if line.strip()]
    return lines


def line_to_dict(line):  # Convert an API table row to a dict
    line = line.strip().split('|')
    name, link = line[1].strip().split('](')
    name, link = name[1:], link[:-1]  # Strip the surrounding [ and ) of the markdown link
    row = {
        'link': link,
        'description': line[2],
        'auth': line[3],
        'https': line[4],
        'cors': line[5],
    }
    return name, row


def section_to_dict(lines, ind):  # Convert one section to a dict
    section = {}
    while ind < len(lines):
        if 'Back to Index' in lines[ind]:  # End of the section
            break
        name, row = line_to_dict(lines[ind])
        section[name] = row
        ind += 1
    return ind, section


def get_section_wise_dict(lines):  # Convert the unstructured lines to a section-wise dict
    ind = 0
    indexes = {}
    while ind < len(lines):
        if '###' in lines[ind]:  # Entering a section
            name = lines[ind][3:].strip()
            # Skip the section heading plus the table header and separator rows
            ind, indexes[name] = section_to_dict(lines, ind + 3)
        ind += 1
    return indexes


def link_to_error_file(error_links):  # Write the bad links to a file for the manual check
    lines = []
    for row in error_links:
        statement = '| {} | [{}]({}) | {} |'.format(row['index'], row['title'], row['link'], str(row['error']))
        lines.append(statement)
    with open('error.txt', 'w') as file:
        file.write('\n#Manual check has to be done on following links#\n\n')
        file.write('| Section | API | Error/Status Code |\n')
        file.write('|---|---|---|\n')
        for line in lines:
            file.write(line)
            file.write('\n')
    print("Written to file")
    print('A manual check has to be done for the links saved in error.txt')


location = input('Location of the raw public-api readme file: ')  # Get the location of the raw readme file
lines = get_lines_from_file(location)
indexes = get_section_wise_dict(lines)
error_links = collect_error_links(indexes)
link_to_error_file(error_links)
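
To show what the parser expects, here is a minimal sketch of the row handling in line_to_dict, run on a made-up readme entry; the API name and URL below are examples for illustration, not necessarily entries from the actual list.

row = '| [Cat Facts](https://catfact.ninja/) | Daily cat facts | No | Yes | No |'

cells = row.strip().split('|')
name, link = cells[1].strip().split('](')
name, link = name[1:], link[:-1]  # strip the leading "[" and trailing ")"
print(name, '->', link)           # Cat Facts -> https://catfact.ninja/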
