Skip to content

Commit 4b2cd8e

Browse files
committed
Added an import task using Spidr.
0 parents  commit 4b2cd8e

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

Rakefile

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Load every task definition found under _tasks/. The list is sorted so the
# load order is identical on every platform and Ruby version (Dir[] only
# returns sorted results by default from Ruby 3.0 on).
Dir['_tasks/*.rb'].sort.each { |path| require_relative(path) }

_tasks/import.rb

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
gem 'spidr', '~> 0.3'
2+
require 'spidr'
3+
require 'fileutils'
4+
5+
# Maps the path component of a page URL to the relative file path the page is
# saved under.
#
# Directory-style paths ("", "/", "en/") and paths without a file extension
# ("en/about") are stored as an index.html inside the matching directory;
# everything else keeps its own name.
#
# @param url_path [String, nil] path component of the page URL
# @return [String, nil] path relative to the working directory, or nil when
#   the path contains a ".." segment and could therefore escape it
def import_local_path(url_path)
  # Drop every leading slash so the result can never be an absolute path.
  relative = url_path.to_s.sub(%r{\A/+}, '')

  # Never let a remote URL pick a location outside the working directory.
  return nil if relative.split('/').include?('..')

  if relative.empty? || relative.end_with?('/')
    "#{relative}index.html"
  elsif File.extname(relative).empty?
    "#{relative}/index.html"
  else
    relative
  end
end

# Builds the lines written for an HTML page: a YAML front matter header
# followed by the markup extracted from the page.
#
# The "page" layout is chosen when the text of the #head element equals the
# page title; in that case only the inner HTML of #content is kept. Otherwise
# the "default" layout is used and the whole #page element is kept. A page
# without a #page element yields the front matter only.
#
# @param page [#title, #doc] the fetched page; +doc+ must answer +at(selector)+
#   or be nil
# @return [Array<String>] chunks to hand to IO#puts, in order
def import_html_lines(page)
  title    = page.title
  document = page.doc
  page_div = document && document.at('#page')
  header   = page_div && page_div.at('#head')

  layout = header && header.inner_text.strip == title ? 'page' : 'default'
  lines  = ['---', "layout: #{layout}", "title: #{title}", '---']

  if page_div.nil?
    # Nothing to extract; keep the front matter so the file is still valid.
  elsif layout == 'default'
    lines << page_div.to_html
  elsif (content_div = page_div.at('#content'))
    lines << content_div.inner_html
  end

  lines
end

desc 'Spiders ruby-lang.org and imports HTML content'
task :import do
  Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
    # Skip CGI scripts and the archived "old-man" section of each language.
    agent.ignore_links_like(%r{/cgi-bin/})
    agent.ignore_links_like(%r{\.cgi/?$})
    agent.ignore_links_like(%r{/[a-z_]+/old-man/})

    agent.every_failed_url do |url|
      puts "Not Found #{url}!"
    end

    agent.every_ok_page do |page|
      local_path = import_local_path(page.url.path)

      if local_path.nil?
        puts "Skipping #{page.url} (path leaves the import directory)"
        next
      end

      # Files imported by an earlier run are left untouched.
      next if File.exist?(local_path)

      # ensure the parent directory exists
      FileUtils.mkdir_p(File.dirname(local_path))

      puts "Saving #{page.url} -> #{local_path} ..."

      if page.html?
        # Build the content before creating the file, so a page that fails to
        # parse does not leave an empty file that later runs would skip.
        lines = import_html_lines(page)
        File.open(local_path, 'wb') { |file| file.puts(*lines) }
      else
        File.open(local_path, 'wb') { |file| file.write(page.body) }
      end
    end
  end
end

0 commit comments

Comments
 (0)