3 -- inspects HTML to find special formatting
4 -- cleanup HTML to make it more pandoc friendly
10 import Text
.HTML
.TagSoup
11 import Text
.HTML
.TagSoup
.Tree
14 import System
.Environment
(getArgs)
16 opendoublequote
= '\x8220
'
17 closingdoublequote
= '\x8221
'
20 input
:: Maybe String,
21 output
:: Maybe String,
22 report1
:: Maybe String,
23 report2
:: Maybe String,
24 rules
:: [TagTree
String -> [TagTree
String]]
27 defaultSetup
= Setup Nothing Nothing Nothing Nothing
[]
29 readSetup
:: [String] -> Setup
-> Setup
31 readSetup
(x
:xs
) = (readSetup xs
) . (parseSetup
(words x
))
33 parseSetup
:: [String] -> Setup
-> Setup
34 parseSetup
(('#':_
):_
) x
= x
36 parseSetup
("output":file
:_
) s
= s
{output
= Just file
}
37 parseSetup
("input":file
:_
) s
= s
{input
= Just file
}
38 parseSetup
("report1":file
:_
) s
= s
{report1
= Just file
}
39 parseSetup
("report2":file
:_
) s
= s
{report2
= Just file
}
40 parseSetup
("fixquotes":_
) s
= s
{rules
= (textfilter fixquotes
: rules s
)}
41 parseSetup
("ignore":tag
) s
= s
{rules
= (ignore tag
: rules s
)}
42 parseSetup
("remove":tag
) s
= s
{rules
= (remove tag
: rules s
)}
43 parseSetup
("replace":sel
:templ
:_
) s
= s
{rules
= (replace sel templ
: rules s
)}
44 parseSetup
("killattr":sel
:attr
) s
= s
{rules
= (killattr sel attr
: rules s
)}
45 parseSetup line _
= error ("Can't parse: " ++ (unwords line
))
47 textfilter
:: (String -> String) -> TagTree
String -> [TagTree
String]
48 textfilter f
(TagLeaf
(TagText s
)) = [TagLeaf
(TagText
(f s
))]
51 fixquotes
:: String -> String
52 fixquotes
('"' : c : s)
53 | or [isSpace c, elem c ").,;"] = closingdoublequote : c : fixquotes s
54 | isAlpha c = opendoublequote : c : fixquotes s
55 | otherwise = '"' : c
: fixquotes s
56 fixquotes
(c
: '"' : s)
57 | or [isSpace c, c == '('] = c : opendoublequote : fixquotes s
58 | and [isAlpha c, (not . isAlpha . head) s] =
59 c : closingdoublequote : fixquotes s
60 | otherwise = c : '"' : fixquotes s
61 fisquotes
(c
: s
) = c
: fixquotes s
64 ignore
:: [String] -> TagTree
String -> [TagTree
String]
65 ignore s t
= if match t s
69 remove
:: [String] -> TagTree
String -> [TagTree
String]
70 remove s t
= if match t s
74 killattr
:: String -> [String] -> TagTree
String -> [TagTree
String]
75 killattr selector
as t
= if selects
(splitOn
";" selector
) t
79 filterAttrs
(TagBranch name attrs c
) =
80 (TagBranch name
(filter pred attrs
) c
)
81 filterAttrs
(TagLeaf
(TagOpen name attrs
)) =
82 (TagLeaf
(TagOpen name
(filter pred attrs
)))
84 pred x
= not (elem (fst x
) as)
86 replace
:: String -> String -> TagTree
String -> [TagTree
String]
87 replace selector template t
= if selects
(splitOn
";" selector
) t
88 then [write
(map attrPair
(splitOn
";" template
)) t
]
91 write
(("tag", n
):xs
) (TagLeaf
(TagOpen _
as)) =
92 (TagLeaf
(TagOpen n
(h
as xs
)))
93 write xs
(TagLeaf
(TagOpen n
as)) =
94 (TagLeaf
(TagOpen n
(h
as xs
)))
95 write
(("tag", n
):xs
) (TagBranch _
as c
) =
96 (TagBranch n
(h
as xs
) c
)
97 write xs
(TagBranch n
as c
) = (TagBranch n
(h
as xs
) c
)
98 h
as = mapMaybe (h2
as)
99 h2
as (i
,[]) = liftM (\s
-> (i
,s
)) $ lookup i
as
103 selects
:: [String] -> TagTree
String -> Bool
104 selects xs t
= all (matches t
) (map attrPair xs
)
106 matches
(TagBranch name
as _
) p
@(e1
, e2
)
107 | e1
== "tag" = e2
== name
108 | e2
== [] = elem e1
$ map fst as
109 |
otherwise = elem p
as
110 matches
(TagLeaf
(TagOpen name
as)) p
@(e1
, e2
)
111 | e1
== "tag" = e2
== name
112 | e2
== [] = elem e1
$ map fst as
113 |
otherwise = elem p
as
116 attrPair
:: String -> (String, String)
117 attrPair
= h
. (break (== '='))
119 h
(e1
,('=':e2
)) = (e1
, e2
)
123 tagContent
:: TagTree
String -> [TagTree
String]
124 tagContent
(TagBranch _ _ ts
) = ts
125 tagContent
(TagLeaf _
) = []
127 match
:: TagTree
String -> [String] -> Bool
128 match t s
= alike t
((head . canonicalizeTags
. parseTags
. unwords) s
)
130 alike
:: TagTree
String -> Tag
String -> Bool
131 alike
(TagBranch s1 a1 _
) (TagOpen s2 a2
) = s1
== s2
&& a1
== a2
132 alike
(TagLeaf t1
) t2
= t1
== t2
135 unlevel
:: [TagTree
String] -> ([Tag
String], [TagTree
String])
136 unlevel
[] = ([], [])
137 unlevel
(x
:xs
) = let (as, bs
) = unlevel xs
141 strip
:: TagTree
String -> (Tag
String, [TagTree
String])
142 strip
(TagLeaf x
) = (x
, [])
143 strip
(TagBranch n
as ts
) = (TagOpen n
as, ts
)
145 sortByLevel
:: [TagTree
String] -> [[Tag
String]]
146 sortByLevel
[] = [[]]
147 sortByLevel ts
= let (cs
, ns
) = unlevel ts
148 in ((filter isTagOpen cs
) : sortByLevel ns
)
150 histogram
:: [Tag
String] -> [(Tag
String, Int)]
151 histogram xs
= let ts
= group $ sort xs
152 in zip (map head ts
) (map length ts
)
154 displayStats
:: [[(Tag
String, Int)]] -> [String]
155 displayStats xs
= let headers
= map ("\nLevel " ++) (map show [1..])
156 in concat $ map (uncurry (:)) (zip headers
(map (map format
) xs
))
157 where format
(t
, n
) = show n
++ "\t" ++ renderTags
[t
]
159 helptext
= "Commandlinearguments are supposed to be control files and\nshould not start with dashes.\n"
161 parseArgs
:: [String] -> Setup
-> IO Setup
162 parseArgs
[] s
= return s
163 parseArgs
(('-':_
):_
) _
= error helptext
164 --parseArgs (a:as) = ((readFile a) >>=) ((parseArgs as) . readSetup . lines)
165 parseArgs
(a
:as) s
= do
167 parseArgs
as (readSetup
(lines c
) s
)
169 report
:: [TagTree
String] -> String
170 report
= unlines . displayStats
. (map histogram
) . sortByLevel
175 setup
<- parseArgs args defaultSetup
176 raw
<- maybe getContents readFile (input setup
)
177 let origTree
= (tagTree
. canonicalizeTags
. parseTags
) raw
179 then hPutStr stdout (report origTree
)
181 maybe (return ()) (flip writeFile (report origTree
)) (report1 setup
)
182 let newTree
= foldr transformTree origTree
(rules setup
)
183 maybe (return ()) (\n -> (writeFile n
) . renderTags
. flattenTree
$ newTree
) (output setup
)
184 case (report2 setup
) of
185 Just n
-> writeFile n
(report newTree
)