|
| 1 | +""" |
| 2 | +爬取Gank.io API接口的数据到MySQL |
| 3 | +""" |
| 4 | +import requests as r |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +import pymysql |
| 7 | + |
| 8 | +# 接口地址 |
| 9 | +search_api_base_url = 'https://gank.io/api/data/' |
| 10 | + |
| 11 | +# 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App |
| 12 | +category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"] |
| 13 | + |
| 14 | +# 图片表名 |
| 15 | +pic_table_name = 'pics' |
| 16 | + |
| 17 | +# 请求分类字段列表 |
| 18 | +type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"] |
| 19 | + |
| 20 | +# 表字段名 |
| 21 | +column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who') |
| 22 | + |
| 23 | +# 图片表字段名 |
| 24 | +pic_column_list = ('_id', 'url') |
| 25 | + |
| 26 | + |
| 27 | +# 创建数据库 |
| 28 | +def create_db(): |
| 29 | + conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306) |
| 30 | + cursor = conn.cursor() |
| 31 | + cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4") |
| 32 | + conn.close() |
| 33 | + conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank') |
| 34 | + return conn |
| 35 | + |
| 36 | + |
| 37 | +# 创建数据库表 |
| 38 | +def init_tables(c, table): |
| 39 | + c.execute( |
| 40 | + ("CREATE TABLE IF Not Exists {table}" |
| 41 | + "(_id CHAR(24) PRIMARY KEY," |
| 42 | + "createdAt TEXT NOT NULL," |
| 43 | + "dsec TEXT NOT NULL," |
| 44 | + "publishedAt TEXT NOT NULL," |
| 45 | + "source TEXT NOT NULL," |
| 46 | + "type TEXT NOT NULL," |
| 47 | + "url TEXT NOT NULL," |
| 48 | + "used TEXT NOT NULL," |
| 49 | + "who TEXT NOT NULL)").format(table=table)) |
| 50 | + |
| 51 | + |
| 52 | +# 创建图表 |
| 53 | +def init_pic_table(c, table): |
| 54 | + c.execute( |
| 55 | + ("CREATE TABLE IF Not Exists {table} " |
| 56 | + "(id INT AUTO_INCREMENT PRIMARY KEY," |
| 57 | + "_id CHAR(24)," |
| 58 | + "url TEXT NOT NULL)").format(table=table)) |
| 59 | + |
| 60 | + |
| 61 | +# 把数据插入到数据库中 |
| 62 | +def insert_data(c, table, column, data): |
| 63 | + try: |
| 64 | + keys = ', '.join(column) |
| 65 | + values = ', '.join(['%s'] * len(data)) |
| 66 | + sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values) |
| 67 | + c.execute(sql, tuple(data)) |
| 68 | + db.commit() |
| 69 | + except Exception as e: |
| 70 | + print(e) |
| 71 | + db.rollback() |
| 72 | + |
| 73 | + |
| 74 | +# 查询数据库表的方法 |
| 75 | +def query_data(c, table): |
| 76 | + try: |
| 77 | + sql = 'SELECT * FROM {table}'.format(table=table) |
| 78 | + c.execute(sql) |
| 79 | + print('共有 %d 行数据' % c.rowcount) |
| 80 | + row = c.fetchone() |
| 81 | + while row: |
| 82 | + print(row) |
| 83 | + row = c.fetchone() |
| 84 | + except Exception as e: |
| 85 | + print(e) |
| 86 | + |
| 87 | + |
| 88 | +# 爬取接口数据的方法 |
| 89 | +def fetch_data(c, pos): |
| 90 | + page_count = 1 |
| 91 | + while True: |
| 92 | + resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count)) |
| 93 | + result_json = resp.json() |
| 94 | + print("抓取:", resp.url) |
| 95 | + if len(result_json['results']) > 0: |
| 96 | + for result in result_json['results']: |
| 97 | + data_list = [result['_id'], |
| 98 | + result['createdAt'], |
| 99 | + result['desc'], |
| 100 | + result['publishedAt'], |
| 101 | + result.get('source', ''), |
| 102 | + result['type'], |
| 103 | + result['url'], |
| 104 | + 1 if result['used'] else 0, |
| 105 | + result.get('who', '') if result.get('who', '') is not None else ''] |
| 106 | + insert_data(c, category_list[pos], column_list, data_list) |
| 107 | + if 'images' in result: |
| 108 | + for image in result['images']: |
| 109 | + insert_data(c, pic_table_name, pic_column_list, [result['_id'], image]) |
| 110 | + page_count += 1 |
| 111 | + else: |
| 112 | + break |
| 113 | + |
| 114 | + |
| 115 | +if __name__ == '__main__': |
| 116 | + db = create_db() |
| 117 | + cursor = db.cursor() |
| 118 | + # for category in category_list: |
| 119 | + # init_tables(cursor, category) |
| 120 | + # init_pic_table(cursor, pic_table_name) |
| 121 | + # for i in range(0, len(category_list)): |
| 122 | + # fetch_data(cursor, i) |
| 123 | + query_data(cursor, 'Android') |
| 124 | + cursor.close() |
0 commit comments