Auf zwei Planeten: Repariere Konvertierungsfehler und Kleinbuchstaben am Satzanfang
[ccbib.git] / pandocTools / preformat.hs
blobd353e710ea6fcccaf9c54d6f8c38f125ab8ad50f
1 -- preformat.hs
2 --
3 -- inspects HTML to find special formatting
4 -- cleanup HTML to make it more pandoc friendly
6 import Data.List
7 import Data.List.Split
8 import Data.Maybe
9 import Control.Monad
10 import Text.HTML.TagSoup
11 import Text.HTML.TagSoup.Tree
12 import Char
13 import System.IO
14 import System.Environment (getArgs)
16 opendoublequote = '\x8220'
17 closingdoublequote = '\x8221'
19 data Setup = Setup {
20 input :: Maybe String,
21 output :: Maybe String,
22 report1 :: Maybe String,
23 report2 :: Maybe String,
24 rules :: [TagTree String -> [TagTree String]]
27 defaultSetup = Setup Nothing Nothing Nothing Nothing []
29 readSetup :: [String] -> Setup -> Setup
30 readSetup [] = id
31 readSetup (x:xs) = (readSetup xs) . (parseSetup (words x))
33 parseSetup :: [String] -> Setup -> Setup
34 parseSetup (('#':_):_) x = x
35 parseSetup [] x = x
36 parseSetup ("output":file:_) s = s {output = Just file}
37 parseSetup ("input":file:_) s = s {input = Just file}
38 parseSetup ("report1":file:_) s = s {report1 = Just file}
39 parseSetup ("report2":file:_) s = s {report2 = Just file }
40 parseSetup ("fixquotes":_) s = s {rules = (textfilter fixquotes : rules s)}
41 parseSetup ("ignore":tag) s = s {rules = (ignore tag : rules s)}
42 parseSetup ("remove":tag) s = s {rules = (remove tag : rules s)}
43 parseSetup ("replace":sel:templ:_) s = s {rules = (replace sel templ : rules s)}
44 parseSetup ("killattr":sel:attr) s = s {rules = (killattr sel attr : rules s)}
45 parseSetup line _ = error ("Can't parse: " ++ (unwords line))
47 textfilter :: (String -> String) -> TagTree String -> [TagTree String]
48 textfilter f (TagLeaf (TagText s)) = [TagLeaf (TagText (f s))]
49 textfilter _ t = [t]
51 fixquotes :: String -> String
52 fixquotes ('"' : c : s)
53 | or [isSpace c, elem c ").,;"] = closingdoublequote : c : fixquotes s
54 | isAlpha c = opendoublequote : c : fixquotes s
55 | otherwise = '"' : c : fixquotes s
56 fixquotes (c : '"' : s)
57 | or [isSpace c, c == '('] = c : opendoublequote : fixquotes s
58 | and [isAlpha c, (not . isAlpha . head) s] =
59 c : closingdoublequote : fixquotes s
60 | otherwise = c : '"' : fixquotes s
61 fisquotes (c : s) = c : fixquotes s
62 --fixquotes "" = ""
64 ignore :: [String] -> TagTree String -> [TagTree String]
65 ignore s t = if match t s
66 then tagContent t
67 else [t]
69 remove :: [String] -> TagTree String -> [TagTree String]
70 remove s t = if match t s
71 then []
72 else [t]
74 killattr :: String -> [String] -> TagTree String -> [TagTree String]
75 killattr selector as t = if selects (splitOn ";" selector) t
76 then [filterAttrs t]
77 else [t]
78 where
79 filterAttrs (TagBranch name attrs c) =
80 (TagBranch name (filter pred attrs) c)
81 filterAttrs (TagLeaf (TagOpen name attrs)) =
82 (TagLeaf (TagOpen name (filter pred attrs)))
83 filterAttrs t = t
84 pred x = not (elem (fst x) as)
86 replace :: String -> String -> TagTree String -> [TagTree String]
87 replace selector template t = if selects (splitOn ";" selector) t
88 then [write (map attrPair (splitOn ";" template)) t]
89 else [t]
90 where
91 write (("tag", n):xs) (TagLeaf (TagOpen _ as)) =
92 (TagLeaf (TagOpen n (h as xs)))
93 write xs (TagLeaf (TagOpen n as)) =
94 (TagLeaf (TagOpen n (h as xs)))
95 write (("tag", n):xs) (TagBranch _ as c) =
96 (TagBranch n (h as xs) c)
97 write xs (TagBranch n as c) = (TagBranch n (h as xs) c)
98 h as = mapMaybe (h2 as)
99 h2 as (i,[]) = liftM (\s -> (i,s)) $ lookup i as
100 h2 _ p = Just p
103 selects :: [String] -> TagTree String -> Bool
104 selects xs t = all (matches t) (map attrPair xs)
105 where
106 matches (TagBranch name as _) p@(e1, e2)
107 | e1 == "tag" = e2 == name
108 | e2 == [] = elem e1 $ map fst as
109 | otherwise = elem p as
110 matches (TagLeaf (TagOpen name as)) p@(e1, e2)
111 | e1 == "tag" = e2 == name
112 | e2 == [] = elem e1 $ map fst as
113 | otherwise = elem p as
114 matches _ _ = False
116 attrPair :: String -> (String, String)
117 attrPair = h . (break (== '='))
118 where
119 h (e1,('=':e2)) = (e1, e2)
120 h p = p
123 tagContent :: TagTree String -> [TagTree String]
124 tagContent (TagBranch _ _ ts) = ts
125 tagContent (TagLeaf _) = []
127 match :: TagTree String -> [String] -> Bool
128 match t s = alike t ((head . canonicalizeTags . parseTags . unwords) s)
130 alike :: TagTree String -> Tag String -> Bool
131 alike (TagBranch s1 a1 _) (TagOpen s2 a2) = s1 == s2 && a1 == a2
132 alike (TagLeaf t1) t2 = t1 == t2
133 alike _ _ = False
135 unlevel :: [TagTree String] -> ([Tag String], [TagTree String])
136 unlevel [] = ([], [])
137 unlevel (x:xs) = let (as, bs) = unlevel xs
138 (t, ls) = strip x
139 in ((t:as), ls++bs)
141 strip :: TagTree String -> (Tag String, [TagTree String])
142 strip (TagLeaf x) = (x, [])
143 strip (TagBranch n as ts) = (TagOpen n as, ts)
145 sortByLevel :: [TagTree String] -> [[Tag String]]
146 sortByLevel [] = [[]]
147 sortByLevel ts = let (cs, ns) = unlevel ts
148 in ((filter isTagOpen cs) : sortByLevel ns)
150 histogram :: [Tag String] -> [(Tag String, Int)]
151 histogram xs = let ts = group $ sort xs
152 in zip (map head ts) (map length ts)
154 displayStats :: [[(Tag String, Int)]] -> [String]
155 displayStats xs = let headers = map ("\nLevel " ++) (map show [1..])
156 in concat $ map (uncurry (:)) (zip headers (map (map format) xs))
157 where format (t, n) = show n ++ "\t" ++ renderTags [t]
159 helptext = "Commandlinearguments are supposed to be control files and\nshould not start with dashes.\n"
161 parseArgs :: [String] -> Setup -> IO Setup
162 parseArgs [] s = return s
163 parseArgs (('-':_):_) _ = error helptext
164 --parseArgs (a:as) = ((readFile a) >>=) ((parseArgs as) . readSetup . lines)
165 parseArgs (a:as) s = do
166 c <- readFile a
167 parseArgs as (readSetup (lines c) s)
169 report :: [TagTree String] -> String
170 report = unlines . displayStats . (map histogram) . sortByLevel
172 main :: IO ()
173 main = do
174 args <- getArgs
175 setup <- parseArgs args defaultSetup
176 raw <- maybe getContents readFile (input setup)
177 let origTree = (tagTree . canonicalizeTags . parseTags) raw
178 if null args
179 then hPutStr stdout (report origTree)
180 else return ()
181 maybe (return ()) (flip writeFile (report origTree)) (report1 setup)
182 let newTree = foldr transformTree origTree (rules setup)
183 maybe (return ()) (\n -> (writeFile n) . renderTags . flattenTree $ newTree) (output setup)
184 case (report2 setup) of
185 Just n -> writeFile n (report newTree)
186 Nothing -> return ()