A script that downloads the article dumps for every language edition of Wikipedia.
{-# LANGUAGE OverloadedStrings, QuasiQuotes #-}
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Lazy.Char8 as B
import Network.HTTP.Conduit
import System.Environment
import System.Process
import System.Process.QQ
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Tree
import Text.Printf
import Text.Regex.TDFA

baseUrl :: String
baseUrl = "http://dumps.wikimedia.org/"

-- Fetch a page and collect (link text, href) for every <a> element
-- whose link text matches the given regex.
extractLinks :: String -> String -> IO [(String, String)]
extractLinks url regex = do
  body <- B.unpack <$> simpleHttp url
  let tree = tagTree $ parseTags body
  return
    [ (name, link)
    | TagBranch "a" attrs [TagLeaf (TagText name)] <- universeTree tree
    , name =~ regex
    , Just link <- [lookup "href" attrs]
    ]

main :: IO ()
main = do
  args <- getArgs
  case args of
    [dest] -> do
      -- Start from a clean destination directory.
      [cmd|rm -rf #{dest}|]
      [cmd|mkdir -p #{dest}|]
      -- The backup index page has one link per wiki (enwiki, jawiki, ...).
      langs <- extractLinks (baseUrl ++ "backup-index.html") ".+wiki$"
      forM_ (zip [1..] langs) $ \(ix, (lang, url)) -> do
        printf "[%d/%d]: %s\n" (ix :: Int) (length langs) lang
        -- Each wiki's page links to its pages-articles dump files.
        links <- extractLinks (baseUrl ++ url) ".+pages-articles\\.xml\\..*"
        forM_ links $ \(name, path) -> do
          let aurl = baseUrl ++ path
          putStrLn $ "> " ++ name ++ ": " ++ aurl
          system $ "aria2c -d " ++ dest ++ " " ++ aurl
    _ -> putStrLn "usage: runhaskell main.hs <dest-dir>"
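To run it you need the http-conduit, tagsoup, process-qq, and regex-tdfa packages (names inferred from the imports) plus aria2 on the PATH; then, for example:

runhaskell main.hs dumps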
tagsoup turned out to be surprisingly easy to use.
That said, the imports do pile up…
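For reference, the tree-matching trick that keeps extractLinks short boils down to something like this; the HTML snippet and the links binding below are made up purely for illustration:

-- parseTags tolerates messy HTML, tagTree turns the flat tag list into a
-- tree, and universeTree enumerates every subtree so we can pattern-match
-- <a> nodes directly with TagBranch/TagLeaf.
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Tree

links :: [(String, String)]
links =
  [ (name, href)
  | TagBranch "a" attrs [TagLeaf (TagText name)] <- universeTree tree
  , Just href <- [lookup "href" attrs]
  ]
  where
    tree = tagTree $ parseTags "<ul><li><a href=\"jawiki/\">jawiki</a></li></ul>"

-- ghci> links
-- [("jawiki","jawiki/")]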