website: Rearranged main page
[ccbib.git] / pandocTools / tagsummary.hs
blob6bc98068b8ffb1d168535ba0390f3298e9a631ba
1 -- extractByTag.hs
2 --
3 -- Extract any text that has been markup up with a given tag.
5 import Text.HTML.TagSoup
6 import System.IO
7 import System.Environment (getArgs)
8 import Data.List.HT (breakAfter)
9 import Data.List (foldl')
11 import Data.Map (Map)
12 import qualified Data.Map as Map
15 data Entry = Entry Int (Tag String)
16 deriving (Show, Eq)
18 instance Num Entry where
19 (+) (Entry i a) (Entry j _) = Entry (i+j) a
21 instance Ord Entry where
22 (>=) (Entry i1 a1) (Entry i2 a2)
23 | i1 == i2 = (>=) a1 a2
24 | otherwise = (>=) i1 i2
25 (<=) (Entry i1 a1) (Entry i2 a2)
26 | i1 == i2 = (<=) a1 a2
27 | otherwise = (<=) i1 i2
29 data State = State Int (Map Entry Int)
30 deriving (Show)
32 stateGetMap :: State -> (Map Entry Int)
33 stateGetMap (State lev m) = m
35 checkArgs :: [String] -> Bool
36 checkArgs x = any (elem '-') x
38 helpstring = "Bad Arguments!\n"
40 main :: IO ()
41 main = do
42 args <- getArgs
43 if (checkArgs args)
44 then hPutStr stderr helpstring
45 else interact (unlines . report)
47 report :: String -> [String]
48 report = reportLines . scan . canonicalizeTags . parseTags
50 reportLines :: (Map Entry Int) -> [String]
51 reportLines m = map fmt (Map.toAscList m)
52 where fmt ((Entry l t), n) =
53 show l ++ "\t" ++ show n ++ "\t" ++ renderTags [t]
55 scan :: [Tag String] -> (Map Entry Int)
56 scan = stateGetMap . foldl' update (State 0 Map.empty)
57 where
58 update (State lev m) t
59 | isStartTag t = State (lev+1) (insert t lev m)
60 | isStopTag t = State (lev-1) (insert t (lev-1) m)
61 | isHtmlTag t = State lev (insert t lev m)
62 | otherwise = State lev m
63 insert t l = Map.insertWith (+) (Entry l t) 1
64 -- update :: Tag String -> State -> State
67 isStartTag :: Tag String -> Bool
68 isStartTag (TagOpen str _) = elem str leveltags
69 isStartTag _ = False
71 isStopTag :: Tag String -> Bool
72 isStopTag (TagClose str) = elem str leveltags
73 isStopTag _ = False
75 isHtmlTag :: Tag String -> Bool
76 isHtmlTag (TagOpen _ _) = True
77 isHtmlTag (TagClose _) = True
78 isHtmlTag _ = False
80 leveltags = ["html", "head", "title", "body",
81 "h1", "h2", "h3", "h4", "h5", "h6", "a",
82 "b", "strong", "i", "tt", "em", "font", "u", "small", "strike",
83 "sub", "sup", "big", "span",
84 "code", "pre", "cite", "blockquote", "center", "div",
85 "table", "ol", "ul"]
86 -- Not sure about, "p", ...