"""
Note:
    This program filters out most of the good links and collects the links that
    returned an error, saving them in a separate file.
    Some links may work in a browser but not in Python because of the webpage's
    security, so a manual check on the filtered records is still needed.
    Since the program has already filtered out most of the good links, checking
    the remaining ones by hand is quick.
    The program takes a while to run, depending on your internet speed.

Instructions:
    Download bad_link_filter and the readme as raw files,
    then execute this script on your machine.
    The bad links will be saved in the error.txt file.
    Finally, manually check the links listed in error.txt and remove the good
    ones from the file.
"""
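
# The parser below assumes the markdown layout of the public-apis README
# (an assumption inferred from the parsing code, not verified against every
# revision of that file): each section starts with a '### Section Name'
# heading, followed by a table header row and a separator row, then one row
# per API in the form
#   | [API Name](https://example.com/docs) | Description | apiKey | Yes | Yes |
# and each section ends with a 'Back to Index' line.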

import requests


def is_url_working(url):  # return a status code/error if the URL looks broken, else None
    try:
        # Browser-like headers: some sites reject requests that carry the
        # default python-requests User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }
        proxies = {'http': None, 'https': None}  # bypass any system-wide proxy
        # A timeout keeps a single unresponsive host from stalling the whole run.
        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        if response.status_code >= 400:
            return response.status_code
    except requests.exceptions.ConnectionError:
        return 'HTTPSConnectionPool error'
    except Exception as e:
        return e
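
# Quick sanity check (httpbin.org is a public testing service; the exact URLs
# are illustrative, any endpoint with known status codes works):
#   is_url_working('https://httpbin.org/status/404')  # -> 404
#   is_url_working('https://httpbin.org/status/200')  # -> None (link is fine)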

def collect_error_links(indexes):  # check every link and collect the ones that fail
    error_links = []
    print('In progress. Completed sections will be listed below; please wait.')
    for index, section in indexes.items():
        for title, row in section.items():
            error = is_url_working(row['link'])
            if error:
                error_links.append({
                    'index': index,
                    'title': title,
                    'link': row['link'],
                    'error': error,
                })
        print(index, 'section completed')
    return error_links

def get_lines_from_file(location):  # open and read the file, returning non-empty stripped lines
    with open(location, 'r') as file:
        lines = [line.strip() for line in file if line.strip()]
    return lines

def line_to_dict(line):  # convert one API table row into a (name, row-dict) pair
    line = line.strip().split('|')
    name, link = line[1].strip().split('](')
    name, link = name[1:], link[:-1]  # drop the surrounding '[' and ')'
    row = {
        'link': link,
        'description': line[2],
        'auth': line[3],
        'https': line[4],
        'cors': line[5],
    }
    return name, row
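
# Example (hypothetical row, matching the assumed README format):
#   line_to_dict('| [Cat Facts](https://catfact.ninja) | Daily cat facts | No | Yes | No |')
#   -> ('Cat Facts', {'link': 'https://catfact.ninja', 'description': ' Daily cat facts ', ...})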

def section_to_dict(lines, ind):  # convert one section's rows to a dict
    section = {}
    while ind < len(lines):
        if 'Back to Index' in lines[ind]:  # marks the end of the section
            break
        name, row = line_to_dict(lines[ind])
        section[name] = row
        ind += 1
    return ind, section

def get_section_wise_dict(lines):  # convert the flat list of lines into a section-wise dict
    ind = 0
    indexes = {}
    while ind < len(lines):
        if '###' in lines[ind]:  # a '###' heading starts a section
            name = lines[ind][3:].strip()
            # +3 skips the heading itself, the table header row, and the
            # separator row, landing on the first API row.
            ind, indexes[name] = section_to_dict(lines, ind + 3)
        ind += 1
    return indexes
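
# Resulting structure (sketch; section and API names are illustrative):
#   {'Animals': {'Cat Facts': {'link': ..., 'description': ..., 'auth': ...,
#                              'https': ..., 'cors': ...}, ...}, ...}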

def link_to_error_file(error_links):  # write the bad links to a file for manual checking
    lines = []
    for row in error_links:
        statement = '| {} | [{}]({}) | {} |'.format(row['index'], row['title'], row['link'], str(row['error']))
        lines.append(statement)
    with open('error.txt', 'w') as file:
        file.write('\n# A manual check is needed for the following links #\n\n')
        file.write('| Section | API | Error/Status Code |\n')
        file.write('|---|---|---|\n')
        for line in lines:
            file.write(line)
            file.write('\n')
    print('Written to file')
    print('A manual check has to be done for the links saved in error.txt')
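
# error.txt then contains one table row per failed link (illustrative values):
#   | Animals | [Cat Facts](https://catfact.ninja) | 404 |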

if __name__ == '__main__':
    location = input('Location of the raw public-apis readme file: ')  # path to the raw readme
    lines = get_lines_from_file(location)
    indexes = get_section_wise_dict(lines)
    error_links = collect_error_links(indexes)
    link_to_error_file(error_links)