tools/linkify.cc

   1 // Read a tabular cross-reference file generated by ctags, then read a list of
   2 // html files generated by Vim's TOhtml command on C++ code. Link words
   3 // in the html files to cross-references from ctags.
   4
   5 // Usage:
   6 //    linkify [tags file] [html files]...
   7
   8 // Still plenty of holes:
   9 // - unnecessarily linking definition location to itself
  10 //   - except SubX definitions, which start at start of line
  11 // - can't detect strings in spite of attempt to support them below, because
  12 //   Vim's generated html turns quotes into html entities
  13 // - distinguishing function and variable names
  14 // - distinguishing Mu code in C++ files
  15 // - distinguishing between function overloads
   16 //   - if there are duplicate tags we aren't smart enough to distinguish between
   17 //     them yet, so we simply don't add any link at all
   18 //   - but even that's not perfect, because sometimes the tags file has a
   19 //     single definition but there are still multiple overloads (say I defined
   20 //     'clear()' on some type, and it's already defined on STL classes)
  21 // - ctags misses some symbols in layered code
  22
  23 #include<assert.h>
  24
  25 #include<map>
  26 using std::map;
  27
  28 #include<string>
  29 using std::string;
  30
  31 #include<iostream>
  32 using std::istream;
  33 using std::cout;
  34 using std::cerr;
  35
  36 #include<sstream>
  37 using std::istringstream;
  38 using std::ostringstream;
  39
  40 #include<fstream>
  41 using std::ifstream;
  42 using std::ofstream;
  43
  44 #include <locale>
  45 using std::isspace;  // unicode-aware
  46
  47 struct syminfo {
  48   string filename;
  49   int line_num;
  50   syminfo() :line_num(0) {}
  51 };
  52
  53 bool has_data(istream& in) {
  54   in.peek();
  55   if (in.eof()) return false;
  56   assert(in);
  57   return true;
  58 }
  59
  60 bool starts_with(const string& s, const string& pat) {
  61   string::const_iterator a=s.begin(), b=pat.begin();
  62   for (/*nada*/;  a!=s.end() && b!=pat.end();  ++a, ++b)
  63     if (*a != *b) return false;
  64   return b == pat.end();
  65 }
  66
  67 bool ends_with(const string& s, const string& pat) {
  68   string::const_reverse_iterator a=s.rbegin(), b=pat.rbegin();
  69   for (/*nada*/;  a!=s.rend() && b!=pat.rend();  ++a, ++b)
  70     if (*a != *b) return false;
  71   return b == pat.rend();
  72 }
  73
  74 void encode_some_html_entities(string& s) {
  75   std::string::size_type pos = 0;
  76   while (true) {
  77     pos = s.find_first_of("<>", pos);
  78     if (pos == std::string::npos) break;
  79     std::string replacement;
  80     switch (s.at(pos)) {
  81       case '<': replacement = "&lt;"; break;
  82       case '>': replacement = "&gt;"; break;
  83     }
  84     s.replace(pos, 1, replacement);
  85     pos += replacement.size();
  86   };
  87 }
  88
  89 void read_tags(const string& filename, map<string, syminfo>& info) {
  90   ifstream in(filename.c_str());
  91 //?   cerr << "reading " << filename << '\n';
  92   string dummy;
  93   while (has_data(in)) {
  94     string symbol;  in >> symbol;
  95     if (symbol == "operator") {
  96       // unsupported
  97       getline(in, dummy);  // skip
  98       continue;
  99     }
 100     encode_some_html_entities(symbol);
 101 //?     cerr << symbol << '\n';
 102     if (info.find(symbol) != info.end()) {
 103       info[symbol].line_num = -1;
 104       info[symbol].filename.clear();
 105     }
 106     else {
 107       in >> dummy;
 108       in >> info[symbol].line_num;
 109       in >> info[symbol].filename;
 110     }
 111     getline(in, dummy);  // skip rest of line
 112 //?     cerr << symbol << ": " << info[symbol].filename << ':' << info[symbol].line_num << '\n';
 113   }
 114   in.close();
 115 }
 116
 117 void replace_tags_in_file(const string& filename, const map<string, syminfo>& info) {
 118 //?   cerr << info.size() << " symbols\n";
 119   ifstream in(filename.c_str());
 120   ofstream out((filename+".out").c_str());
 121   while (has_data(in)) {
 122     // send lines that don't start with '<span' straight through
 123     string line;
 124     getline(in, line);
 125     if (!starts_with(line, "<span ")) {
 126       out << line << '\n';
 127     }
 128     else {
 129       static int span_size = string("</span>").size();
 130       int skip_first_span = line.find("</span>") + span_size;
 131       out << line.substr(0, skip_first_span);
 132       istringstream in2(line.substr(skip_first_span));
 133       in2 >> std::noskipws;
 134       // only in .subx files, refuse to linkify the first word on a line
 135       bool at_start_of_line = ends_with(filename, ".subx.html");
 136 //?       cerr << filename << ": " << at_start_of_line << '\n';
 137       while (has_data(in2)) {
 138         if (isspace(in2.peek())) {
 139 //?           cerr << "space\n";
 140           char c;  in2 >> c;
 141           out << c;
 142           at_start_of_line = false;
 143         }
 144         // within a line, send straight through all characters inside '<..>'
 145         else if (in2.peek() == '<') {
 146 //?           cerr << "tag\n";
 147           char c = '\0';
 148           while (in2 >> c) {
 149 //?             cerr << "span: " << c << '\n';
 150             out << c;
 151             if (c == '>') break;
 152           }
 153           // don't include initial tag when computing 'at_start_of_line'
 154 //?           cerr << "end tag\n";
 155         }
 156         else {
 157           // send straight through all characters inside strings (handling escapes)
 158           char c = in2.get();
 159           if (c == '"') {
 160 //?             cerr << "string\n";
 161             out << c;
 162             while (in2 >> c) {
 163               out << c;
 164               if (c == '\\') {
 165                 in2 >> c;  out << c;
 166               }
 167               else if (c == '"') {
 168                 break;
 169               }
 170             }
 171             at_start_of_line = false;
 172           }
 173           else if (c == '\'') {
 174 //?             cerr << "character\n";
 175             out << c;
 176             while (in2 >> c) {
 177               out << c;
 178               if (c == '\\') {
 179                 in2 >> c;  out << c;
 180               }
 181               else if (c == '\'') {
 182                 break;
 183               }
 184             }
 185             at_start_of_line = false;
 186           }
 187           // send straight through any characters after '#' (comments)
 188           else if (c == '#') {
 189 //?             cerr << "comment\n";
 190             out << c;
 191             while (in2 >> c) out << c;
 192             at_start_of_line = false;
 193           }
 194           // send straight through any characters after '//' (comments)
 195           else if (c == '/' && in2.peek() == '/') {
 196 //?             cerr << "comment\n";
 197             out << c;
 198             while (in2 >> c) out << c;
 199             at_start_of_line = false;
 200           }
 201           // send through open parens at start of line
 202           else if (c == '(') {
 203             out << c;
 204             at_start_of_line = false;
 205           }
 206           else if (c == ')') {
 207             out << c;
 208             at_start_of_line = false;
 209           }
 210           else {
 211 //?             cerr << "rest\n";
 212             if (c == ',' || c == ':') {
 213               out << c;
 214               at_start_of_line = false;
 215               continue;
 216             }
 217             ostringstream out2;
 218             out2 << c;
 219             while (in2 >> c) {
 220               if (isspace(c) || c == '<' || c == '"' || c == '\'' || c == '/' || c == ',' || c == ':' || c == '(' || c == ')') {  // keep sync'd with other clauses above
 221                 in2.putback(c);
 222                 break;
 223               }
 224               out2 << c;
 225             }
 226             string symbol = out2.str();
 227             if (symbol == "equal" || symbol == "index" || symbol == "put-index" || symbol == "length") {
 228 //?               cerr << "  blacklisted\n";
 229               out << symbol;
 230             }
 231             else if (info.find(symbol) == info.end()) {
 232 //?               cerr << "  no info\n";
 233               out << symbol;
 234             }
 235             else {
 236               const syminfo& s = info.find(symbol)->second;
 237               if (s.filename.empty()) {
 238 //?                 cerr << "  empty info\n";
 239                 out << symbol;
 240               }
 241               else {
 242                 if (at_start_of_line) {
 243 //?                   cerr << "  at start of line; refusing to linkify " << symbol << "\n";
 244                   out << symbol;
 245                 }
 246                 else {
 247 //?                   cerr << "  link\n";
 248                   out << "<a href='" << s.filename << ".html#L" << s.line_num << "'>" << symbol << "</a>";
 249                 }
 250               }
 251             }
 252           }  // end rest
 253         }
 254       }  // done parsing line
 255       out << '\n';
 256     }
 257   }
 258   in.close();  out.close();
 259 }
 260
 261 int main(int argc, const char* argv[]) {
 262   map<string, syminfo> info;
 263   read_tags(argv[1], info);
 264   for (int i = 2;  i < argc;  ++i)
 265     replace_tags_in_file(argv[i], info);
 266   return 0;
 267 }