File tree 2 files changed +63
-0
lines changed
2 files changed +63
-0
lines changed Original file line number Diff line number Diff line change
1
# Load every task definition under _tasks/. The glob is anchored to this
# file's directory (not the current working directory) so it still finds the
# tasks when rake is started from elsewhere, and the result is sorted so the
# load order is deterministic.
Dir[File.expand_path('_tasks/*.rb', File.dirname(__FILE__))].sort.each { |path| require(path) }
Original file line number Diff line number Diff line change
1
+ gem 'spidr' , '~> 0.3'
2
+ require 'spidr'
3
+ require 'fileutils'
4
+
5
# Helpers for the :import task, kept in a module so they stay out of the
# global namespace and can be exercised without crawling the site.
module RubyLangImport
  module_function

  # Maps the path component of a URL onto a relative file path.
  #
  # Leading slashes are stripped so the result is never absolute. Paths that
  # are empty, end in "/", or have no file extension are treated as
  # directories and get an "index.html" inside them.
  #
  # @param url_path [String, nil] path component of the page URL
  # @return [String] relative path to save the page under
  def local_path_for(url_path)
    relative = url_path.to_s.sub(%r{\A/+}, '')

    if relative.empty? || relative.end_with?('/')
      "#{relative}index.html"
    elsif File.extname(relative).empty?
      "#{relative}/index.html"
    else
      relative
    end
  end

  # Builds the lines written for an HTML page: a front-matter header followed
  # by the extracted markup, if any.
  #
  # The layout is 'page' when the text of the #head element inside #page
  # equals the page title, and 'default' otherwise. With the 'default' layout
  # the whole #page element is emitted; with the 'page' layout only the inner
  # HTML of #content is. Pages lacking those elements yield the header alone.
  #
  # @param page [#title, #doc] crawled page; doc is expected to respond to
  #   #at with a CSS selector (presumably a Nokogiri document — confirm)
  # @return [Array<String>] lines to write, in order
  def html_lines(page)
    title    = page.title
    doc      = page.doc
    page_div = doc && doc.at('#page')
    header   = page_div && page_div.at('#head')
    layout   = (header && header.inner_text.strip == title) ? 'page' : 'default'

    lines = ['---', "layout: #{layout}", "title: #{title}", '---']

    if layout == 'default'
      lines << page_div.to_html if page_div
    elsif (content_div = page_div.at('#content'))
      # layout is 'page' only when #page was found, so page_div is non-nil here
      lines << content_div.inner_html
    end

    lines
  end

  # Writes a crawled page to local_path, creating parent directories first.
  # HTML pages are written as front matter plus extracted markup; everything
  # else is written as the raw response body.
  #
  # @param page [#html?, #body, #title, #doc] crawled page
  # @param local_path [String] destination path
  # @return [void]
  def save(page, local_path)
    # Render before opening the file: an error while extracting the markup
    # must not leave an empty file behind, because existing files are skipped
    # on later runs.
    lines = page.html? ? html_lines(page) : nil

    FileUtils.mkdir_p(File.dirname(local_path))

    File.open(local_path, 'wb') do |file|
      if lines
        file.puts(lines)
      else
        file.write(page.body)
      end
    end
  end
end

desc 'Spiders ruby-lang.org and imports HTML content'
task :import do
  Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
    # Skip CGI endpoints and the old-man pages.
    agent.ignore_links_like(%r{/cgi-bin/})
    agent.ignore_links_like(%r{\.cgi/?$})
    agent.ignore_links_like(%r{/[a-z_]+/old-man/})

    agent.every_failed_url do |url|
      puts "Not Found #{url}!"
    end

    agent.every_ok_page do |page|
      local_path = RubyLangImport.local_path_for(page.url.path)

      # Never overwrite a file that is already present.
      next if File.exist?(local_path)

      puts "Saving #{page.url} -> #{local_path} ..."
      RubyLangImport.save(page, local_path)
    end
  end
end
You can’t perform that action at this time.
0 commit comments