Skip to content

Commit ab9d3ee

Browse files
committed
1/15
1 parent 9e541d5 commit ab9d3ee

File tree

5 files changed

+43
-25
lines changed

5 files changed

+43
-25
lines changed

mm131/.idea/mm131.iml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mm131/mm131/items.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,22 @@
44

55

66
class Item(scrapy.Item):
# Scrapy item describing one scraped image and where it is stored.
7+
78
# Title
89
image_title = scrapy.Field()
9-
# Image ID
10+
# Image ID
1011
image_id = scrapy.Field()
1112
# Where the image comes from (source site)
1213
image_from = scrapy.Field()
1314
# Category code
1415
category_code = scrapy.Field()
16+
# Directory part of the image path that is saved in the database
17+
image_url_dir = scrapy.Field()
1518
# Image path (as saved in the database)
1619
image_url = scrapy.Field()
1720
# Storage directory = disk + image_url (absolute path on disk)
18-
path = scrapy.Field()
21+
dir_path = scrapy.Field()
22+
# Storage file path = disk + image_url (absolute path on disk) + file_name.jpg
23+
file_path = scrapy.Field()
1924
# Download URL of each individual image
2025
image_down_url = scrapy.Field()

mm131/mm131/settings.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
}
2424

2525
# Item pipelines to enable, keyed by dotted class path; the integer is the
# Scrapy ordering value (lower values run first).
# NOTE(review): after this change only the debug TestPipeline is active; the
# MySQL and file pipelines are commented out -- confirm this is intended.
ITEM_PIPELINES = {
26-
'mm131.mysqlpipelines.MySqlPipeline': 300,
26+
#'mm131.mysqlpipelines.MySqlPipeline': 300,
2727
#'mm131.filepipelines.FilePipeline': 200,
28+
'mm131.testpipelines.TestPipeline': 300,
2829
}
2930

3031
#Mysql数据库的配置信息

mm131/mm131/spiders/spider.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import random
1111
import string
1212
from lxml import html
13-
from pypinyin import pinyin, lazy_pinyin
1413
reload(sys)
1514
sys.setdefaultencoding('utf-8')
1615

@@ -54,37 +53,39 @@ def parse_one(self, response):
5453
image = html.fromstring(image)
5554
image_html_url = image.xpath(u'.//a/@href')[0].encode('utf-8')
5655
item = Item()
57-
print images
56+
#图片标题
5857
item['image_title'] = image.xpath(u".//a/img/@alt")[0].encode('utf-8')
59-
print item['image_title']
60-
#随机生成10位图片文件夹名称 mm131/xinggan/028e3md7
58+
#随机生成10位图片文件夹名称 mm131/xinggan/028e3md7 salt
6159
salt = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
62-
image_url = self.image_from + '/' + category_code + '/' + salt
63-
item['image_url'] = image_url
64-
path = self.disk + '/' + category_code + '/' + salt
60+
#图片路径目录(保存在数据库的)image_url_dir
61+
image_url_dir = self.image_from + '/' + category_code + '/' + salt
62+
item['image_url_dir'] = image_url_dir
63+
dir_path = self.disk + '/' + category_code + '/' + salt
64+
item['dir_path'] = dir_path
6565
#创建分类文件夹
66-
if not os.path.isdir(path):
67-
os.makedirs(path)
66+
if not os.path.isdir(dir_path):
67+
os.makedirs(dir_path)
6868
item['image_id'] = str(uuid.uuid1())
6969
item['category_code'] = category_code
7070
item['image_from'] = self.image_from
71-
print item
72-
yield Request(image_html_url, meta={'item': item, 'path':path}, callback=self.parse_two)
71+
# print 'parse_one图片Id' + item['image_id']
72+
# print 'parse_one图片标题' + item['image_title']
73+
# print 'parse_one图片图片保存在数据库的目录' + item['image_url_dir']
74+
# print 'parse_one图片存储的目录' + item['dir_path']
75+
yield Request(image_html_url, meta={'item': item}, callback=self.parse_two)
7376

7477
#1.获取每个image的标题,url入口,分类
7578
#2.随机生成一个image_id
7679
def parse_two(self, response):
    """Parse one page of an image set, emit its item, then follow the next page.

    Each page of a set shows a single image.  The item carried in
    ``response.meta['item']`` supplies the shared fields (id, title,
    category, directories); this callback fills in the per-image fields
    ``image_down_url``, ``file_path`` and ``image_url``.

    Yields:
        The completed item for this page, followed by a ``Request`` for the
        next page when a "下一页" (next page) link is present.
    """
    # Work on a copy so an item already handed to the pipelines is never
    # mutated when a later page of the same set is parsed.
    item = response.meta['item'].copy()
    category_code = item['category_code']

    # Keep the result as a list: on the last page there is no "next page"
    # link, and indexing [0] unconditionally raised IndexError before the
    # final item could be yielded.
    next_page_urls = response.xpath(
        u'.//div/a[@class="page-ch" and text()="下一页"]/@href').extract()

    item['image_down_url'] = response.xpath(
        u'//div[@class="content-pic"]/a/img/@src').extract()[0].encode('utf-8')

    # File name is the current timestamp reduced to its digits.
    file_name = re.sub(r'[^0-9]', '', str(datetime.datetime.now()))
    item['file_path'] = item['dir_path'] + '/' + file_name + '.jpg'
    item['image_url'] = item['image_url_dir'] + '/' + file_name + '.jpg'
    yield item

    # Follow the next page of this image set, if there is one.
    if next_page_urls:
        next_page_url = next_page_urls[0].encode('utf-8')
        yield Request(self.main_url + category_code + '/' + next_page_url,
                      meta={'item': item}, callback=self.parse_two)

mm131/mm131/testpipelines.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
from twisted.enterprise import adbapi
3+
import pymysql
4+
import pymysql.cursors
5+
6+
class TestPipeline(object):
    """Debug pipeline that prints selected fields of every scraped item."""

    def process_item(self, item, spider):
        """Print the item's id, title, database URL and disk path.

        Args:
            item: the scraped item; must provide ``image_id``,
                ``image_title``, ``image_url`` and ``file_path``.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, unchanged, so that any pipeline running after
            this one still receives it (Scrapy passes on whatever
            ``process_item`` returns).
        """
        # Single-argument print(...) produces the same output on Python 2
        # and is also valid Python 3 syntax.
        print('parse_two图片Id' + item['image_id'])
        print('parse_two图片标题' + item['image_title'])
        print('parse_two图片保存在数据库的目录' + item['image_url'])
        print('parse_two图片存储的目录' + item['file_path'])
        return item

0 commit comments

Comments
 (0)