
Commit fb0f702

add log support for scrapy
1 parent b05418d commit fb0f702

File tree

5 files changed: +18 -6 lines changed


config/rules.py

Lines changed: 7 additions & 0 deletions
@@ -362,6 +362,13 @@
         'internal': 20,
         'enable': 1,
     },
+    {
+        'name': 'zhihu',
+        'task_queue': TEMP_ZHIHU_QUEUE,
+        'resource': VALIDATED_ZHIHU_QUEUE,
+        'internal': 20,
+        'enable': 1,
+    },
 ]

 # crawlers will fetch tasks from the following queues
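
Each dict in this list describes one validator task: as the scheduler/scheduler.py hunks below show, the scheduler drains proxies from the 'resource' set into 'task_queue' for re-checking, at most once every 'internal' minutes ('internal' appears to be the project's spelling of "interval"), and only while 'enable' is 1. A minimal sketch of how such an entry might be consumed, using plain redis-py and hypothetical queue names standing in for TEMP_ZHIHU_QUEUE and VALIDATED_ZHIHU_QUEUE:

    import time

    import redis

    # Hypothetical stand-ins for the constants referenced by the entry.
    TIMER_RECORDER = 'haipproxy:timer'
    task = {
        'name': 'zhihu',
        'task_queue': 'haipproxy:spider:zhihu',    # assumed value of TEMP_ZHIHU_QUEUE
        'resource': 'haipproxy:validated:zhihu',   # assumed value of VALIDATED_ZHIHU_QUEUE
        'internal': 20,                             # minutes between runs
        'enable': 1,
    }

    conn = redis.StrictRedis()

    def maybe_schedule(task):
        """Re-queue validated proxies for another check when the task is due."""
        if not task['enable']:
            return None
        now = int(time.time())
        last = conn.hget(TIMER_RECORDER, task['name'])
        if last is not None and now - int(last) < task['internal'] * 60:
            return None  # ran too recently
        # Assumption: the resource queue is a redis set, matching the
        # sadd/smembers usage visible in the scheduler diff below.
        proxies = conn.smembers(task['resource'])
        if not proxies:
            return None
        conn.sadd(task['task_queue'], *proxies)
        conn.hset(TIMER_RECORDER, task['name'], now)
        return True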

config/settings.py

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@

 # scrapy log settings
 LOG_LEVEL = 'DEBUG'
+LOG_FILE = 'logs/haipproxy.log'


 #####################################################################
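
LOG_LEVEL and LOG_FILE are standard Scrapy settings: when LOG_FILE is set, the root handler Scrapy installs writes log records to that path instead of standard error, filtered at LOG_LEVEL. A minimal sketch of the mechanism (it assumes the logs/ directory already exists, which logger/__init__.py takes care of):

    import logging

    from scrapy.settings import Settings
    from scrapy.utils.log import configure_logging

    settings = Settings({
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'logs/haipproxy.log',  # same value as the diff above
    })

    # Installs a root handler that honours LOG_FILE and LOG_LEVEL.
    configure_logging(settings, install_root_handler=True)
    logging.getLogger(__name__).debug('this record lands in logs/haipproxy.log')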

crawler/redis_spiders.py

Lines changed: 3 additions & 0 deletions
@@ -7,12 +7,15 @@
 from scrapy.spiders import (
     Spider, CrawlSpider)
 from scrapy_splash import SplashRequest
+from scrapy.utils.log import configure_logging

 from logger import crawler_logger
 from utils import get_redis_conn
 from config.settings import (
     VALIDATOR_FEED_SIZE, SPIDER_FEED_SIZE)

+
+configure_logging(install_root_handler=True)
 __all__ = ['RedisSpider', 'RedisAjaxSpider',
            'RedisCrawlSpider', 'ValidatorRedisSpider']

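The module-level configure_logging call wires Python's standard logging into Scrapy's root handler at import time, so records from the project's own loggers (such as crawler_logger) and from Scrapy internals flow through one configuration. Called without a settings argument it uses Scrapy's defaults; in recent Scrapy versions a CrawlerProcess created later with the project settings re-installs the handler, at which point the LOG_FILE above takes effect. A small sketch of the behaviour:

    import logging

    from scrapy.utils.log import configure_logging

    # With no settings argument, Scrapy's defaults apply
    # (LOG_LEVEL 'DEBUG', output to stderr).
    configure_logging(install_root_handler=True)

    # Any stdlib logger now propagates to the installed root handler,
    # so module-level messages are captured before a spider runs.
    # 'haipproxy.demo' is a hypothetical logger name for illustration.
    logging.getLogger('haipproxy.demo').info('redis spiders module loaded')
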
logger/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 if not os.path.exists(log_dir):
     os.mkdir(log_dir)

-log_path = os.path.join(log_dir, 'haiproxy.log')
+log_path = os.path.join(log_dir, 'haipproxy.log')

 log_config = {
     'version': 1.0,
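
This hunk fixes the misspelled filename ('haiproxy.log' to 'haipproxy.log') so the project logger and Scrapy's LOG_FILE point at the same file. The diff shows only the top of the dictConfig-style log_config dict; a hypothetical minimal configuration in the same style, writing to the corrected path (handler and formatter names here are illustrative, not the project's):

    import logging.config
    import os

    log_dir = 'logs'
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_path = os.path.join(log_dir, 'haipproxy.log')

    log_config = {
        'version': 1.0,  # dictConfig requires version 1; 1.0 compares equal
        'formatters': {
            'simple': {'format': '%(asctime)s %(levelname)s %(message)s'},
        },
        'handlers': {
            'file': {
                'class': 'logging.FileHandler',
                'filename': log_path,
                'formatter': 'simple',
            },
        },
        'root': {'level': 'INFO', 'handlers': ['file']},
    }

    logging.config.dictConfig(log_config)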

scheduler/scheduler.py

Lines changed: 6 additions & 5 deletions
@@ -113,7 +113,7 @@ def schedule_task_with_lock(self, task):
             pipe.lpush(task_queue, *urls)
             pipe.hset(TIMER_RECORDER, task_name, now)
             pipe.execute()
-            scheduler_logger('crawler task {} has been stored into redis successfully'.format(task_name))
+            scheduler_logger.info('crawler task {} has been stored into redis successfully'.format(task_name))
             return True
         else:
             return None
@@ -146,13 +146,13 @@ def schedule_task_with_lock(self, task):
         r, proxies = pipe.execute()
         if not r or (now - int(r.decode('utf-8'))) >= internal * 60:
             if not proxies:
-                scheduler_logger('fetched no proxies from task {}'.format(task_name))
+                scheduler_logger.warning('fetched no proxies from task {}'.format(task_name))
                 return None

             pipe.sadd(task_queue, *proxies)
             pipe.hset(TIMER_RECORDER, task_name, now)
             pipe.execute()
-            scheduler_logger('validator task {} has been stored into redis successfully'.format(task_name))
+            scheduler_logger.info('validator task {} has been stored into redis successfully'.format(task_name))
             return True
         else:
             return None
@@ -165,6 +165,7 @@ def schedule_task_with_lock(self, task):
 @click.argument('task_queues', nargs=-1)
 def scheduler_start(usage, task_queues):
     """Start specified scheduler."""
+    scheduler_logger.info('{} scheduler is starting...'.format(usage))
     default_tasks = CRWALER_TASKS if usage == 'crawler' else VALIDATOR_TASKS
     default_allow_tasks = DEFAULT_CRAWLER_TASKS if usage == 'crawler' else DEFAULT_VALIDATORS_TASKS
     maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
@@ -177,7 +178,7 @@ def scheduler_start(usage, task_queues):
     for task_queue in task_queues:
         allow_task_queue = maps.get(task_queue)
         if not allow_task_queue:
-            scheduler_logger('scheduler task {} is invalid task, the allowed tasks are {}'.format(
+            scheduler_logger.warning('scheduler task {} is an invalid task, the allowed tasks are {}'.format(
                 task_queue, list(maps.keys())))
             continue
         scheduler.task_queues.append(allow_task_queue)
@@ -207,7 +208,7 @@ def crawler_start(usage, tasks):
             spiders.append(case.spider)
             break
     else:
-        crawler_logger.warning('spider task {} is invalid task, the allowed tasks are {}'.format(
+        crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
             task, list(maps.keys())))
     if not spiders:
         crawler_logger.warning('no spider starts up, please check your task input')
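
Beyond the wording tweaks, the substantive fix in this file is that scheduler_logger was being called as if it were a function; a logging.Logger instance is not callable, so each of those lines raised a TypeError at runtime instead of logging anything. A minimal reproduction of the bug and the fix:

    import logging

    logging.basicConfig(level=logging.INFO)
    scheduler_logger = logging.getLogger('scheduler')

    try:
        scheduler_logger('task stored')  # old pattern
    except TypeError as exc:
        print(exc)  # "'Logger' object is not callable"

    scheduler_logger.info('task stored')            # fixed pattern
    scheduler_logger.warning('fetched no proxies')  # level chosen per message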
