10
10
import random
11
11
import string
12
12
from lxml import html
13
- from pypinyin import pinyin , lazy_pinyin
14
13
reload (sys )
15
14
sys .setdefaultencoding ('utf-8' )
16
15
@@ -54,37 +53,39 @@ def parse_one(self, response):
54
53
image = html .fromstring (image )
55
54
image_html_url = image .xpath (u'.//a/@href' )[0 ].encode ('utf-8' )
56
55
item = Item ()
57
- print images
56
+ #图片标题
58
57
item ['image_title' ] = image .xpath (u".//a/img/@alt" )[0 ].encode ('utf-8' )
59
- print item ['image_title' ]
60
- #随机生成10位图片文件夹名称 mm131/xinggan/028e3md7
58
+ #随机生成10位图片文件夹名称 mm131/xinggan/028e3md7 salt
61
59
salt = '' .join (random .sample (string .ascii_lowercase + string .digits , 10 ))
62
- image_url = self .image_from + '/' + category_code + '/' + salt
63
- item ['image_url' ] = image_url
64
- path = self .disk + '/' + category_code + '/' + salt
60
+ #图片路径目录(保存在数据库的)image_url_dir
61
+ image_url_dir = self .image_from + '/' + category_code + '/' + salt
62
+ item ['image_url_dir' ] = image_url_dir
63
+ dir_path = self .disk + '/' + category_code + '/' + salt
64
+ item ['dir_path' ] = dir_path
65
65
#创建分类文件夹
66
- if not os .path .isdir (path ):
67
- os .makedirs (path )
66
+ if not os .path .isdir (dir_path ):
67
+ os .makedirs (dir_path )
68
68
item ['image_id' ] = str (uuid .uuid1 ())
69
69
item ['category_code' ] = category_code
70
70
item ['image_from' ] = self .image_from
71
- print item
72
- yield Request (image_html_url , meta = {'item' : item , 'path' :path }, callback = self .parse_two )
71
+ # print 'parse_one图片Id' + item['image_id']
72
+ # print 'parse_one图片标题' + item['image_title']
73
+ # print 'parse_one图片图片保存在数据库的目录' + item['image_url_dir']
74
+ # print 'parse_one图片存储的目录' + item['dir_path']
75
+ yield Request (image_html_url , meta = {'item' : item }, callback = self .parse_two )
73
76
74
77
#1. Get each image's title, entry URL, and category
75
78
#2. Randomly generate an image_id
76
79
def parse_two(self, response):
    """Emit the image found on one gallery page, then follow the "next page" link.

    The partially-filled item is read from ``response.meta['item']``. This
    method reads its ``category_code``, ``dir_path`` and ``image_url_dir``
    keys and sets ``image_down_url``, ``file_path`` and ``image_url``.

    Yields:
        The item for this page, followed by a ``Request`` for the next page
        (handled by this same callback) when a next-page link is present.

    Raises:
        IndexError: if the page has no ``div.content-pic`` image ``src``.
    """
    item = response.meta['item']
    category_code = item['category_code']

    # Href of the pager link labelled "next page". The list is empty on the
    # last page of a gallery, so it must not be indexed unconditionally:
    # doing so raised IndexError before the last image was ever yielded.
    next_page_hrefs = response.xpath(
        u'.//div/a[@class="page-ch" and text()="下一页"]/@href').extract()

    item['image_down_url'] = response.xpath(
        u'//div[@class="content-pic"]/a/img/@src').extract()[0].encode('utf-8')

    # File name is the current timestamp reduced to its digits.
    file_name = re.sub(r'[^0-9]', '', str(datetime.datetime.now()))
    item['file_path'] = item['dir_path'] + '/' + file_name + '.jpg'
    item['image_url'] = item['image_url_dir'] + '/' + file_name + '.jpg'
    yield item

    if next_page_hrefs:
        next_page_url = next_page_hrefs[0].encode('utf-8')
        # Hand the next callback its own copy: it overwrites the per-image
        # keys, and the item yielded above may still be held downstream.
        yield Request(self.main_url + category_code + '/' + next_page_url,
                      meta={'item': item.copy()},
                      callback=self.parse_two)
0 commit comments