ctags/parsers/bibtex.c

   1 /*
   2  *       Copyright (c) 2008, David Fishburn
   3  *       Copyright (c) 2012, Jan Larres
   4  *       Copyright (c) 2019, Mirco Schönfeld
   5  *
   6  *       This source code is released for free distribution under the terms of the
   7  *       GNU General Public License version 2 or (at your option) any later version.
   8  *
   9  *       This module contains functions for generating identifiers of entries of Bibtex language files.
  10  *
  11  *       BibTex language "reference":
  12  *               https://en.wikipedia.org/wiki/BibTeX
  13  */
  14
  15 /*
  16  *       INCLUDE FILES
  17  */
  18 #include "general.h"    /* must always come first */
  19 #include <ctype.h>      /* to define isalpha () */
  20 #include <string.h>
  21
  22 #include "debug.h"
  23 #include "entry.h"
  24 #include "keyword.h"
  25 #include "parse.h"
  26 #include "read.h"
  27 #include "routines.h"
  28 #include "vstring.h"
  29
  30 /*
  31  *       MACROS
  32  */
  33 #define isType(token,t)         (bool) ((token)->type == (t))
  34 #define isKeyword(token,k)      (bool) ((token)->keyword == (k))
  35 #define isIdentChar(c) \
  36         (isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+' || (c) == ':')
  37
  38 /*
  39  *       DATA DECLARATIONS
  40  */
  41
  42 /*
  43  * Used to specify type of keyword.
  44  */
  45 enum eKeywordId {
  46         KEYWORD_article,
  47         KEYWORD_book,
  48         KEYWORD_booklet,
  49         KEYWORD_conference,
  50         KEYWORD_inbook,
  51         KEYWORD_incollection,
  52         KEYWORD_inproceedings,
  53         KEYWORD_manual,
  54         KEYWORD_mastersthesis,
  55         KEYWORD_misc,
  56         KEYWORD_phdthesis,
  57         KEYWORD_proceedings,
  58         KEYWORD_string,
  59         KEYWORD_techreport,
  60         KEYWORD_unpublished
  61 };
  62 typedef int keywordId; /* to allow KEYWORD_NONE */
  63
  64 enum eTokenType {
  65         /* 0..255 are the byte's value.  Some are named for convenience */
  66         TOKEN_OPEN_CURLY = '{',
  67         /* above is special types */
  68         TOKEN_UNDEFINED = 256,
  69         TOKEN_KEYWORD,
  70         TOKEN_IDENTIFIER
  71 };
  72 typedef int tokenType;
  73
  74 typedef struct sTokenInfo {
  75         tokenType               type;
  76         keywordId               keyword;
  77         vString *               string;
  78         unsigned long   lineNumber;
  79         MIOPos                  filePosition;
  80 } tokenInfo;
  81
  82 /*
  83  *      DATA DEFINITIONS
  84  */
  85
  86 static langType Lang_bib;
  87
  88 typedef enum {
  89         BIBTAG_ARTICLE,
  90         BIBTAG_BOOK,
  91         BIBTAG_BOOKLET,
  92         BIBTAG_CONFERENCE,
  93         BIBTAG_INBOOK,
  94         BIBTAG_INCOLLECTION,
  95         BIBTAG_INPROCEEDINGS,
  96         BIBTAG_MANUAL,
  97         BIBTAG_MASTERSTHESIS,
  98         BIBTAG_MISC,
  99         BIBTAG_PHDTHESIS,
 100         BIBTAG_PROCEEDINGS,
 101         BIBTAG_STRING,
 102         BIBTAG_TECHREPORT,
 103         BIBTAG_UNPUBLISHED,
 104         BIBTAG_COUNT
 105 } bibKind;
 106
 107 static kindDefinition BibKinds [] = {
 108         { true,  'a', "article",                                "article"                               },
 109         { true,  'b', "book",                                           "book"                                  },
 110         { true,  'B', "booklet",                                "booklet"                               },
 111         { true,  'c', "conference",                     "conference"            },
 112         { true,  'i', "inbook",                                 "inbook"                                },
 113         { true,  'I', "incollection",           "incollection"  },
 114         { true,  'j', "inproceedings",  "inproceedings" },
 115         { true,  'm', "manual",                                 "manual"                                },
 116         { true,  'M', "mastersthesis",  "mastersthesis" },
 117         { true,  'n', "misc",                                           "misc"                                  },
 118         { true,  'p', "phdthesis",                      "phdthesis"                     },
 119         { true,  'P', "proceedings",            "proceedings"           },
 120         { true,  's', "string",                                 "string"                                },
 121         { true,  't', "techreport",                     "techreport"            },
 122         { true,  'u', "unpublished",            "unpublished"           }
 123 };
 124
 125 static const keywordTable BibKeywordTable [] = {
 126         /* keyword                        keyword ID */
 127         { "article",        KEYWORD_article                             },
 128         { "book",               KEYWORD_book                              },
 129         { "booklet",        KEYWORD_booklet                             },
 130         { "conference",   KEYWORD_conference            },
 131         { "inbook",           KEYWORD_inbook                            },
 132         { "incollection",       KEYWORD_incollection    },
 133         { "inproceedings",KEYWORD_inproceedings },
 134         { "manual",           KEYWORD_manual                            },
 135         { "mastersthesis",KEYWORD_mastersthesis },
 136         { "misc",               KEYWORD_misc                              },
 137         { "phdthesis",    KEYWORD_phdthesis                     },
 138         { "proceedings",        KEYWORD_proceedings             },
 139         { "string",                             KEYWORD_string                          },
 140         { "techreport",   KEYWORD_techreport            },
 141         { "unpublished",        KEYWORD_unpublished             }
 142 };
 143
 144 /*
 145  *       FUNCTION DEFINITIONS
 146  */
 147
 148 static tokenInfo *newToken (void)
 149 {
 150         tokenInfo *const token = xMalloc (1, tokenInfo);
 151
 152         token->type                     = TOKEN_UNDEFINED;
 153         token->keyword          = KEYWORD_NONE;
 154         token->string           = vStringNew ();
 155         token->lineNumber   = getInputLineNumber ();
 156         token->filePosition = getInputFilePosition ();
 157
 158         return token;
 159 }
 160
 161 static void deleteToken (tokenInfo *const token)
 162 {
 163         vStringDelete (token->string);
 164         eFree (token);
 165 }
 166
 167 /*
 168  *       Tag generation functions
 169  */
 170 static void makeBibTag (tokenInfo *const token, bibKind kind)
 171 {
 172         if (BibKinds [kind].enabled)
 173         {
 174                 const char *const name = vStringValue (token->string);
 175                 tagEntryInfo e;
 176                 initTagEntry (&e, name, kind);
 177
 178                 e.lineNumber   = token->lineNumber;
 179                 e.filePosition = token->filePosition;
 180
 181                 makeTagEntry (&e);
 182         }
 183 }
 184
 185 /*
 186  *       Parsing functions
 187  */
 188
 189 /*
 190  *      Read a C identifier beginning with "firstChar" and places it into
 191  *      "name".
 192  */
 193 static void parseIdentifier (vString *const string, const int firstChar)
 194 {
 195         int c = firstChar;
 196         Assert (isIdentChar (c));
 197         do
 198         {
 199                 vStringPut (string, c);
 200                 c = getcFromInputFile ();
 201         } while (c != EOF && isIdentChar (c));
 202         if (c != EOF)
 203                 ungetcToInputFile (c);          /* unget non-identifier character */
 204 }
 205
 206 static bool readToken (tokenInfo *const token)
 207 {
 208         int c;
 209
 210         token->type                     = TOKEN_UNDEFINED;
 211         token->keyword          = KEYWORD_NONE;
 212         vStringClear (token->string);
 213
 214 getNextChar:
 215
 216         do
 217         {
 218                 c = getcFromInputFile ();
 219         }
 220         while (c == '\t' || c == ' ' || c == '\n');
 221
 222         token->lineNumber   = getInputLineNumber ();
 223         token->filePosition = getInputFilePosition ();
 224
 225         token->type = (unsigned char) c;
 226         switch (c)
 227         {
 228                 case EOF: return false;
 229
 230                 case '@':
 231                                         /*
 232                                          * All Bib entries start with an at symbol.
 233                                          * Check if the next character is an alpha character
 234                                          * else it is not a potential tex tag.
 235                                          */
 236                                         c = getcFromInputFile ();
 237                                         if (! isalpha (c))
 238                                           ungetcToInputFile (c);
 239                                         else
 240                                         {
 241                                                 vStringPut (token->string, '@');
 242                                                 parseIdentifier (token->string, c);
 243                                                 token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
 244                                                 if (isKeyword (token, KEYWORD_NONE))
 245                                                         token->type = TOKEN_IDENTIFIER;
 246                                                 else
 247                                                         token->type = TOKEN_KEYWORD;
 248                                         }
 249                                         break;
 250                 case '%':
 251                                         skipToCharacterInInputFile ('\n'); /* % are single line comments */
 252                                         goto getNextChar;
 253                                         break;
 254                 default:
 255                                         if (isIdentChar (c))
 256                                         {
 257                                                 parseIdentifier (token->string, c);
 258                                                 token->type = TOKEN_IDENTIFIER;
 259                                         }
 260                                         break;
 261         }
 262         return true;
 263 }
 264
 265 static void copyToken (tokenInfo *const dest, tokenInfo *const src)
 266 {
 267         dest->lineNumber = src->lineNumber;
 268         dest->filePosition = src->filePosition;
 269         dest->type = src->type;
 270         dest->keyword = src->keyword;
 271         vStringCopy (dest->string, src->string);
 272 }
 273
 274 /*
 275  *       Scanning functions
 276  */
 277
 278 static bool parseTag (tokenInfo *const token, bibKind kind)
 279 {
 280         tokenInfo *     const name = newToken ();
 281         vString *               currentid;
 282         bool                            eof = false;
 283
 284         currentid = vStringNew ();
 285         /*
 286          * Bib entries are of these formats:
 287          *   @article{identifier,
 288          *   author="John Doe"}
 289          *
 290          * When a keyword is found, loop through all words up to
 291          * a comma brace for the tag name.
 292          *
 293          */
 294         if (isType (token, TOKEN_KEYWORD))
 295         {
 296                 copyToken (name, token);
 297                 if (!readToken (token))
 298                 {
 299                         eof = true;
 300                         goto out;
 301                 }
 302         }
 303
 304         if (isType (token, TOKEN_OPEN_CURLY))
 305         {
 306                 if (!readToken (token))
 307                 {
 308                         eof = true;
 309                         goto out;
 310                 }
 311                 if (isType (token, TOKEN_IDENTIFIER)){
 312                         vStringCat (currentid, token->string);
 313                         vStringStripTrailing (currentid);
 314                         if (vStringLength (currentid) > 0)
 315                         {
 316                                 vStringCopy (name->string, currentid);
 317                                 makeBibTag (name, kind);
 318                         }
 319                 }
 320                 else
 321                 { // should find an identifier for bib item at first place
 322                         eof = true;
 323                         goto out;
 324                 }
 325         }
 326
 327  out:
 328         deleteToken (name);
 329         vStringDelete (currentid);
 330         return eof;
 331 }
 332
 333 static void parseBibFile (tokenInfo *const token)
 334 {
 335         bool eof = false;
 336
 337         do
 338         {
 339                 if (!readToken (token))
 340                         break;
 341
 342                 if (isType (token, TOKEN_KEYWORD))
 343                 {
 344                         switch (token->keyword)
 345                         {
 346                                 case KEYWORD_article:
 347                                         eof = parseTag (token, BIBTAG_ARTICLE);
 348                                         break;
 349                                 case KEYWORD_book:
 350                                         eof = parseTag (token, BIBTAG_BOOK);
 351                                         break;
 352                                 case KEYWORD_booklet:
 353                                         eof = parseTag (token, BIBTAG_BOOKLET);
 354                                         break;
 355                                 case KEYWORD_conference:
 356                                         eof = parseTag (token, BIBTAG_CONFERENCE);
 357                                         break;
 358                                 case KEYWORD_inbook:
 359                                         eof = parseTag (token, BIBTAG_INBOOK);
 360                                         break;
 361                                 case KEYWORD_incollection:
 362                                         eof = parseTag (token, BIBTAG_INCOLLECTION);
 363                                         break;
 364                                 case KEYWORD_inproceedings:
 365                                         eof = parseTag (token, BIBTAG_INPROCEEDINGS);
 366                                         break;
 367                                 case KEYWORD_manual:
 368                                         eof = parseTag (token, BIBTAG_MANUAL);
 369                                         break;
 370                                 case KEYWORD_mastersthesis:
 371                                         eof = parseTag (token, BIBTAG_MASTERSTHESIS);
 372                                         break;
 373                                 case KEYWORD_misc:
 374                                         eof = parseTag (token, BIBTAG_MISC);
 375                                         break;
 376                                 case KEYWORD_phdthesis:
 377                                         eof = parseTag (token, BIBTAG_PHDTHESIS);
 378                                         break;
 379                                 case KEYWORD_proceedings:
 380                                         eof = parseTag (token, BIBTAG_PROCEEDINGS);
 381                                         break;
 382                                 case KEYWORD_string:
 383                                         eof = parseTag (token, BIBTAG_STRING);
 384                                         break;
 385                                 case KEYWORD_techreport:
 386                                         eof = parseTag (token, BIBTAG_TECHREPORT);
 387                                         break;
 388                                 case KEYWORD_unpublished:
 389                                         eof = parseTag (token, BIBTAG_UNPUBLISHED);
 390                                         break;
 391                                 default:
 392                                         break;
 393                         }
 394                 }
 395                 if (eof)
 396                         break;
 397         } while (true);
 398 }
 399
 400 static void initialize (const langType language)
 401 {
 402         Lang_bib = language;
 403 }
 404
 405 static void findBibTags (void)
 406 {
 407         tokenInfo *const token = newToken ();
 408
 409         parseBibFile (token);
 410
 411         deleteToken (token);
 412 }
 413
 414 /* Create parser definition structure */
 415 extern parserDefinition* BibtexParser (void)
 416 {
 417         Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
 418         static const char *const extensions [] = { "bib", NULL };
 419         parserDefinition *const def = parserNew ("BibTeX");
 420         def->extensions = extensions;
 421         /*
 422          * New definitions for parsing instead of regex
 423          */
 424         def->kindTable          = BibKinds;
 425         def->kindCount          = ARRAY_SIZE (BibKinds);
 426         def->parser                             = findBibTags;
 427         def->initialize         = initialize;
 428         def->keywordTable       = BibKeywordTable;
 429         def->keywordCount       = ARRAY_SIZE (BibKeywordTable);
 430         return def;
 431 }