much love
[mu.git] / tools / linkify.cc
blobece50748fb8b0f20410b08c38c1dfc2d2471b81d
// Read a tabular cross-reference file generated by ctags, then read a list of
// html files generated by Vim's TOhtml command on C++ code. Link words
// in the html files to cross-references from ctags.
//
// Usage:
//   linkify [tags file] [html files]...
//
// Still plenty of holes:
//   - unnecessarily linking definition location to itself
//     - except SubX definitions, which start at start of line
//   - can't detect strings in spite of attempt to support them below, because
//     Vim's generated html turns quotes into html entities
//   - distinguishing function and variable names
//   - distinguishing Mu code in C++ files
//   - distinguishing between function overloads
//   - if there are duplicate tags we aren't smart enough to distinguish between
//     them yet, so we simply don't add any link at all
//     - but even that's not perfect, because sometimes the tags file has a
//       single definition but there are still multiple overloads (say I defined
//       'clear()' on some type, and it's already defined on STL classes)
//   - ctags misses some symbols in layered code

#include<assert.h>

#include<cctype>

#include<map>
using std::map;

#include<string>
using std::string;

#include<iostream>
using std::istream;
using std::cout;
using std::cerr;

#include<sstream>
using std::istringstream;
using std::ostringstream;

#include<fstream>
using std::ifstream;
using std::ofstream;

#include <locale>
using std::isspace;  // unicode-aware

// Where a symbol is defined, as read from one row of the tags file.
//
// read_tags() marks a symbol that occurs more than once by setting line_num
// to -1 and clearing filename; replace_tags_in_file() never emits a link for
// a symbol whose filename is empty.
struct syminfo {
  string filename;  // file containing the definition; empty => never link
  int line_num;     // line number of the definition within 'filename'
  syminfo() :line_num(0) {}
};

// Report whether 'in' has at least one more character to read.
//
// Only peeks; nothing is consumed. Returns false once the stream has hit
// end-of-file. A stream that has failed for any other reason (e.g. a file that
// could not be opened) trips the assert in debug builds; when asserts are
// compiled out it reports "no data" instead of "true", so that callers looping
// on has_data() terminate rather than spin forever.
bool has_data(istream& in) {
  in.peek();
  if (in.eof()) return false;
  assert(in);
  return !in.fail();
}

// Return true if 's' begins with 'pat'. An empty 'pat' matches every string.
bool starts_with(const string& s, const string& pat) {
  return s.size() >= pat.size() && s.compare(0, pat.size(), pat) == 0;
}

// Return true if 's' ends with 'pat'. An empty 'pat' matches every string.
bool ends_with(const string& s, const string& pat) {
  return s.size() >= pat.size()
      && s.compare(s.size()-pat.size(), pat.size(), pat) == 0;
}

// Replace every '<' in 's' with "&lt;" and every '>' with "&gt;", in place.
// No other characters are touched (hence "some" entities).
void encode_some_html_entities(string& s) {
  string::size_type pos = 0;
  while ((pos = s.find_first_of("<>", pos)) != string::npos) {
    const string replacement = (s[pos] == '<') ? "&lt;" : "&gt;";
    s.replace(pos, 1, replacement);
    // resume after the entity just written so it is never rescanned
    pos += replacement.size();
  }
}

89 void read_tags(const string& filename, map<string, syminfo>& info) {
90 ifstream in(filename.c_str());
91 //? cerr << "reading " << filename << '\n';
92 string dummy;
93 while (has_data(in)) {
94 string symbol; in >> symbol;
95 if (symbol == "operator") {
96 // unsupported
97 getline(in, dummy); // skip
98 continue;
100 encode_some_html_entities(symbol);
101 //? cerr << symbol << '\n';
102 if (info.find(symbol) != info.end()) {
103 info[symbol].line_num = -1;
104 info[symbol].filename.clear();
106 else {
107 in >> dummy;
108 in >> info[symbol].line_num;
109 in >> info[symbol].filename;
111 getline(in, dummy); // skip rest of line
112 //? cerr << symbol << ": " << info[symbol].filename << ':' << info[symbol].line_num << '\n';
114 in.close();
117 void replace_tags_in_file(const string& filename, const map<string, syminfo>& info) {
118 //? cerr << info.size() << " symbols\n";
119 ifstream in(filename.c_str());
120 ofstream out((filename+".out").c_str());
121 while (has_data(in)) {
122 // send lines that don't start with '<span' straight through
123 string line;
124 getline(in, line);
125 if (!starts_with(line, "<span ")) {
126 out << line << '\n';
128 else {
129 static int span_size = string("</span>").size();
130 int skip_first_span = line.find("</span>") + span_size;
131 out << line.substr(0, skip_first_span);
132 istringstream in2(line.substr(skip_first_span));
133 in2 >> std::noskipws;
134 // only in .subx files, refuse to linkify the first word on a line
135 bool at_start_of_line = ends_with(filename, ".subx.html");
136 //? cerr << filename << ": " << at_start_of_line << '\n';
137 while (has_data(in2)) {
138 if (isspace(in2.peek())) {
139 //? cerr << "space\n";
140 char c; in2 >> c;
141 out << c;
142 at_start_of_line = false;
144 // within a line, send straight through all characters inside '<..>'
145 else if (in2.peek() == '<') {
146 //? cerr << "tag\n";
147 char c = '\0';
148 while (in2 >> c) {
149 //? cerr << "span: " << c << '\n';
150 out << c;
151 if (c == '>') break;
153 // don't include initial tag when computing 'at_start_of_line'
154 //? cerr << "end tag\n";
156 else {
157 // send straight through all characters inside strings (handling escapes)
158 char c = in2.get();
159 if (c == '"') {
160 //? cerr << "string\n";
161 out << c;
162 while (in2 >> c) {
163 out << c;
164 if (c == '\\') {
165 in2 >> c; out << c;
167 else if (c == '"') {
168 break;
171 at_start_of_line = false;
173 else if (c == '\'') {
174 //? cerr << "character\n";
175 out << c;
176 while (in2 >> c) {
177 out << c;
178 if (c == '\\') {
179 in2 >> c; out << c;
181 else if (c == '\'') {
182 break;
185 at_start_of_line = false;
187 // send straight through any characters after '#' (comments)
188 else if (c == '#') {
189 //? cerr << "comment\n";
190 out << c;
191 while (in2 >> c) out << c;
192 at_start_of_line = false;
194 // send straight through any characters after '//' (comments)
195 else if (c == '/' && in2.peek() == '/') {
196 //? cerr << "comment\n";
197 out << c;
198 while (in2 >> c) out << c;
199 at_start_of_line = false;
201 // send through open parens at start of line
202 else if (c == '(') {
203 out << c;
204 at_start_of_line = false;
206 else if (c == ')') {
207 out << c;
208 at_start_of_line = false;
210 else {
211 //? cerr << "rest\n";
212 if (c == ',' || c == ':') {
213 out << c;
214 at_start_of_line = false;
215 continue;
217 ostringstream out2;
218 out2 << c;
219 while (in2 >> c) {
220 if (isspace(c) || c == '<' || c == '"' || c == '\'' || c == '/' || c == ',' || c == ':' || c == '(' || c == ')') { // keep sync'd with other clauses above
221 in2.putback(c);
222 break;
224 out2 << c;
226 string symbol = out2.str();
227 if (symbol == "equal" || symbol == "index" || symbol == "put-index" || symbol == "length") {
228 //? cerr << " blacklisted\n";
229 out << symbol;
231 else if (info.find(symbol) == info.end()) {
232 //? cerr << " no info\n";
233 out << symbol;
235 else {
236 const syminfo& s = info.find(symbol)->second;
237 if (s.filename.empty()) {
238 //? cerr << " empty info\n";
239 out << symbol;
241 else {
242 if (at_start_of_line) {
243 //? cerr << " at start of line; refusing to linkify " << symbol << "\n";
244 out << symbol;
246 else {
247 //? cerr << " link\n";
248 out << "<a href='" << s.filename << ".html#L" << s.line_num << "'>" << symbol << "</a>";
252 } // end rest
254 } // done parsing line
255 out << '\n';
258 in.close(); out.close();
261 int main(int argc, const char* argv[]) {
262 map<string, syminfo> info;
263 read_tags(argv[1], info);
264 for (int i = 2; i < argc; ++i)
265 replace_tags_in_file(argv[i], info);
266 return 0;