ctags/parsers/bibtex.c

   1 /*
   2  *       Copyright (c) 2008, David Fishburn
   3  *       Copyright (c) 2012, Jan Larres
   4  *       Copyright (c) 2019, Mirco Schönfeld
   5  *
   6  *       This source code is released for free distribution under the terms of the
   7  *       GNU General Public License version 2 or (at your option) any later version.
   8  *
   9  *       This module contains functions for generating identifiers of entries of Bibtex language files.
  10  *
  11  *       BibTex language "reference":
  12  *               https://en.wikipedia.org/wiki/BibTeX
  13  */
  14
  15 /*
  16  *       INCLUDE FILES
  17  */
  18 #include "general.h"    /* must always come first */
  19 #include <ctype.h>      /* to define isalpha () */
  20 #include <string.h>
  21
  22 #include "debug.h"
  23 #include "bibtex.h"
  24 #include "entry.h"
  25 #include "keyword.h"
  26 #include "parse.h"
  27 #include "read.h"
  28 #include "routines.h"
  29 #include "vstring.h"
  30
  31 /*
  32  *       MACROS
  33  */
  34 #define isType(token,t)         (bool) ((token)->type == (t))
  35 #define isKeyword(token,k)      (bool) ((token)->keyword == (k))
  36 #define isIdentChar(c) \
  37         (isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+' || (c) == ':' || (c) == '.' || (c) == '/')
  38
  39 /*
  40  *       DATA DECLARATIONS
  41  */
  42
  43 /*
  44  * Used to specify type of keyword.
  45  */
  46 enum eKeywordId {
  47         KEYWORD_article,
  48         KEYWORD_book,
  49         KEYWORD_booklet,
  50         KEYWORD_conference,
  51         KEYWORD_inbook,
  52         KEYWORD_incollection,
  53         KEYWORD_inproceedings,
  54         KEYWORD_manual,
  55         KEYWORD_mastersthesis,
  56         KEYWORD_misc,
  57         KEYWORD_phdthesis,
  58         KEYWORD_proceedings,
  59         KEYWORD_string,
  60         KEYWORD_techreport,
  61         KEYWORD_unpublished
  62 };
  63 typedef int keywordId; /* to allow KEYWORD_NONE */
  64
  65 enum eTokenType {
  66         /* 0..255 are the byte's value.  Some are named for convenience */
  67         TOKEN_OPEN_CURLY = '{',
  68         /* above is special types */
  69         TOKEN_UNDEFINED = 256,
  70         TOKEN_KEYWORD,
  71         TOKEN_IDENTIFIER
  72 };
  73 typedef int tokenType;
  74
  75 typedef struct sTokenInfo {
  76         tokenType               type;
  77         keywordId               keyword;
  78         vString *               string;
  79         unsigned long   lineNumber;
  80         MIOPos                  filePosition;
  81 } tokenInfo;
  82
  83 /*
  84  *      DATA DEFINITIONS
  85  */
  86
  87 static langType Lang_bib;
  88
  89 typedef enum {
  90         BIBTAG_ARTICLE,
  91         BIBTAG_BOOK,
  92         BIBTAG_BOOKLET,
  93         BIBTAG_CONFERENCE,
  94         BIBTAG_INBOOK,
  95         BIBTAG_INCOLLECTION,
  96         BIBTAG_INPROCEEDINGS,
  97         BIBTAG_MANUAL,
  98         BIBTAG_MASTERSTHESIS,
  99         BIBTAG_MISC,
 100         BIBTAG_PHDTHESIS,
 101         BIBTAG_PROCEEDINGS,
 102         BIBTAG_STRING,
 103         BIBTAG_TECHREPORT,
 104         BIBTAG_UNPUBLISHED,
 105         BIBTAG_COUNT
 106 } bibKind;
 107
 108 static kindDefinition BibKinds [] = {
 109         { true,  'a', "article",                                "article"                               },
 110         { true,  'b', "book",                                           "book"                                  },
 111         { true,  'B', "booklet",                                "booklet"                               },
 112         { true,  'c', "conference",                     "conference"            },
 113         { true,  'i', "inbook",                                 "inbook"                                },
 114         { true,  'I', "incollection",           "incollection"  },
 115         { true,  'j', "inproceedings",  "inproceedings" },
 116         { true,  'm', "manual",                                 "manual"                                },
 117         { true,  'M', "mastersthesis",  "mastersthesis" },
 118         { true,  'n', "misc",                                           "misc"                                  },
 119         { true,  'p', "phdthesis",                      "phdthesis"                     },
 120         { true,  'P', "proceedings",            "proceedings"           },
 121         { true,  's', "string",                                 "string"                                },
 122         { true,  't', "techreport",                     "techreport"            },
 123         { true,  'u', "unpublished",            "unpublished"           }
 124 };
 125
 126 static const keywordTable BibKeywordTable [] = {
 127         /* keyword                        keyword ID */
 128         { "article",        KEYWORD_article                             },
 129         { "book",               KEYWORD_book                              },
 130         { "booklet",        KEYWORD_booklet                             },
 131         { "conference",   KEYWORD_conference            },
 132         { "inbook",           KEYWORD_inbook                            },
 133         { "incollection",       KEYWORD_incollection    },
 134         { "inproceedings",KEYWORD_inproceedings },
 135         { "manual",           KEYWORD_manual                            },
 136         { "mastersthesis",KEYWORD_mastersthesis },
 137         { "misc",               KEYWORD_misc                              },
 138         { "phdthesis",    KEYWORD_phdthesis                     },
 139         { "proceedings",        KEYWORD_proceedings             },
 140         { "string",                             KEYWORD_string                          },
 141         { "techreport",   KEYWORD_techreport            },
 142         { "unpublished",        KEYWORD_unpublished             }
 143 };
 144
 145 /*
 146  *       FUNCTION DEFINITIONS
 147  */
 148
 149 static tokenInfo *newToken (void)
 150 {
 151         tokenInfo *const token = xMalloc (1, tokenInfo);
 152
 153         token->type                     = TOKEN_UNDEFINED;
 154         token->keyword          = KEYWORD_NONE;
 155         token->string           = vStringNew ();
 156         token->lineNumber   = getInputLineNumber ();
 157         token->filePosition = getInputFilePosition ();
 158
 159         return token;
 160 }
 161
 162 static void deleteToken (tokenInfo *const token)
 163 {
 164         vStringDelete (token->string);
 165         eFree (token);
 166 }
 167
 168 /*
 169  *       Tag generation functions
 170  */
 171 static void makeBibTag (tokenInfo *const token, bibKind kind)
 172 {
 173         const char *const name = vStringValue (token->string);
 174         tagEntryInfo e;
 175         initTagEntry (&e, name, kind);
 176
 177         updateTagLine (&e, token->lineNumber, token->filePosition);
 178
 179         makeTagEntry (&e);
 180 }
 181
 182 /*
 183  *       Parsing functions
 184  */
 185
 186 /*
 187  *      Read a C identifier beginning with "firstChar" and places it into
 188  *      "name".
 189  */
 190 static void parseIdentifier (vString *const string, const int firstChar)
 191 {
 192         int c = firstChar;
 193         Assert (isIdentChar (c));
 194         do
 195         {
 196                 vStringPut (string, c);
 197                 c = getcFromInputFile ();
 198         } while (c != EOF && isIdentChar (c));
 199         if (c != EOF)
 200                 ungetcToInputFile (c);          /* unget non-identifier character */
 201 }
 202
 203 static bool readToken (tokenInfo *const token)
 204 {
 205         int c;
 206
 207         token->type                     = TOKEN_UNDEFINED;
 208         token->keyword          = KEYWORD_NONE;
 209         vStringClear (token->string);
 210
 211 getNextChar:
 212
 213         do
 214         {
 215                 c = getcFromInputFile ();
 216         }
 217         while (c == '\t' || c == ' ' || c == '\n');
 218
 219         token->lineNumber   = getInputLineNumber ();
 220         token->filePosition = getInputFilePosition ();
 221
 222         token->type = (unsigned char) c;
 223         switch (c)
 224         {
 225                 case EOF: return false;
 226
 227                 case '@':
 228                                         /*
 229                                          * All Bib entries start with an at symbol.
 230                                          * Check if the next character is an alpha character
 231                                          * else it is not a potential tex tag.
 232                                          */
 233                                         c = getcFromInputFile ();
 234                                         if (! isalpha (c))
 235                                           ungetcToInputFile (c);
 236                                         else
 237                                         {
 238                                                 vStringPut (token->string, '@');
 239                                                 parseIdentifier (token->string, c);
 240                                                 token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
 241                                                 if (isKeyword (token, KEYWORD_NONE))
 242                                                         token->type = TOKEN_IDENTIFIER;
 243                                                 else
 244                                                         token->type = TOKEN_KEYWORD;
 245                                         }
 246                                         break;
 247                 case '%':
 248                                         skipToCharacterInInputFile ('\n'); /* % are single line comments */
 249                                         goto getNextChar;
 250                                         break;
 251                 default:
 252                                         if (isIdentChar (c))
 253                                         {
 254                                                 parseIdentifier (token->string, c);
 255                                                 token->type = TOKEN_IDENTIFIER;
 256                                         }
 257                                         break;
 258         }
 259         return true;
 260 }
 261
 262 static void copyToken (tokenInfo *const dest, tokenInfo *const src)
 263 {
 264         dest->lineNumber = src->lineNumber;
 265         dest->filePosition = src->filePosition;
 266         dest->type = src->type;
 267         dest->keyword = src->keyword;
 268         vStringCopy (dest->string, src->string);
 269 }
 270
 271 /*
 272  *       Scanning functions
 273  */
 274
 275 static bool parseTag (tokenInfo *const token, bool foreignKeyword, int kind)
 276 {
 277         tokenInfo *     const name = newToken ();
 278         vString *               currentid;
 279         bool                            eof = false;
 280
 281         currentid = vStringNew ();
 282         /*
 283          * Bib entries are of these formats:
 284          *   @article{identifier,
 285          *   author="John Doe"}
 286          *
 287          * When a keyword is found, loop through all words up to
 288          * a comma brace for the tag name.
 289          *
 290          */
 291         if (isType (token, TOKEN_KEYWORD) || foreignKeyword)
 292         {
 293                 copyToken (name, token);
 294                 if (!readToken (token))
 295                 {
 296                         eof = true;
 297                         goto out;
 298                 }
 299         }
 300
 301         if (isType (token, TOKEN_OPEN_CURLY))
 302         {
 303                 if (!readToken (token))
 304                 {
 305                         eof = true;
 306                         goto out;
 307                 }
 308                 if (isType (token, TOKEN_IDENTIFIER)){
 309                         vStringCat (currentid, token->string);
 310                         vStringStripTrailing (currentid);
 311                         if (vStringLength (currentid) > 0)
 312                         {
 313                                 vStringCopy (name->string, currentid);
 314                                 makeBibTag (name, kind);
 315                         }
 316                 }
 317                 else
 318                 { // should find an identifier for bib item at first place
 319                         eof = true;
 320                         goto out;
 321                 }
 322         }
 323
 324  out:
 325         deleteToken (name);
 326         vStringDelete (currentid);
 327         return eof;
 328 }
 329
 330 static bool mayParseTokenInSubparser (tokenInfo *const token)
 331 {
 332         bool eof = false;
 333         subparser *sub;
 334
 335         if (*vStringValue (token->string) != '@')
 336                 return eof;
 337
 338         foreachSubparser (sub, true)
 339         {
 340                 bibTexSubparser *bibsub = (bibTexSubparser *)sub;
 341                 if (bibsub->isKeywordForTagging)
 342                 {
 343                         int kind;
 344                         enterSubparser (sub);
 345                         kind = bibsub->isKeywordForTagging (bibsub,
 346                                                                                                 vStringValue (token->string) + 1);
 347                         if (kind != KIND_GHOST_INDEX)
 348                                 eof = parseTag (token, true, kind);
 349                         leaveSubparser ();
 350                         if (kind != KIND_GHOST_INDEX)
 351                                 break;
 352                 }
 353         }
 354
 355         return eof;
 356 }
 357
 358 static void parseBibFile (tokenInfo *const token)
 359 {
 360         bool eof = false;
 361
 362         do
 363         {
 364                 if (!readToken (token))
 365                         break;
 366
 367                 bibKind kind = KIND_GHOST_INDEX;;
 368
 369                 if (isType (token, TOKEN_KEYWORD))
 370                 {
 371                         switch (token->keyword)
 372                         {
 373                                 case KEYWORD_article:
 374                                         kind = BIBTAG_ARTICLE;
 375                                         break;
 376                                 case KEYWORD_book:
 377                                         kind = BIBTAG_BOOK;
 378                                         break;
 379                                 case KEYWORD_booklet:
 380                                         kind = BIBTAG_BOOKLET;
 381                                         break;
 382                                 case KEYWORD_conference:
 383                                         kind = BIBTAG_CONFERENCE;
 384                                         break;
 385                                 case KEYWORD_inbook:
 386                                         kind = BIBTAG_INBOOK;
 387                                         break;
 388                                 case KEYWORD_incollection:
 389                                         kind = BIBTAG_INCOLLECTION;
 390                                         break;
 391                                 case KEYWORD_inproceedings:
 392                                         kind = BIBTAG_INPROCEEDINGS;
 393                                         break;
 394                                 case KEYWORD_manual:
 395                                         kind = BIBTAG_MANUAL;
 396                                         break;
 397                                 case KEYWORD_mastersthesis:
 398                                         kind = BIBTAG_MASTERSTHESIS;
 399                                         break;
 400                                 case KEYWORD_misc:
 401                                         kind = BIBTAG_MISC;
 402                                         break;
 403                                 case KEYWORD_phdthesis:
 404                                         kind = BIBTAG_PHDTHESIS;
 405                                         break;
 406                                 case KEYWORD_proceedings:
 407                                         kind = BIBTAG_PROCEEDINGS;
 408                                         break;
 409                                 case KEYWORD_string:
 410                                         kind = BIBTAG_STRING;
 411                                         break;
 412                                 case KEYWORD_techreport:
 413                                         kind = BIBTAG_TECHREPORT;
 414                                         break;
 415                                 case KEYWORD_unpublished:
 416                                         kind = BIBTAG_UNPUBLISHED;
 417                                         break;
 418                         }
 419                 }
 420
 421                 if (kind != KIND_GHOST_INDEX)
 422                         eof = parseTag (token, false, kind);
 423                 else
 424                         eof = mayParseTokenInSubparser(token);
 425
 426         } while (!eof);
 427 }
 428
 429 static void initialize (const langType language)
 430 {
 431         Lang_bib = language;
 432 }
 433
 434 static void findBibTags (void)
 435 {
 436         tokenInfo *const token = newToken ();
 437
 438         parseBibFile (token);
 439
 440         deleteToken (token);
 441 }
 442
 443 /* Create parser definition structure */
 444 extern parserDefinition* BibtexParser (void)
 445 {
 446         Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
 447         static const char *const extensions [] = { "bib", NULL };
 448         parserDefinition *const def = parserNew ("BibTeX");
 449         def->extensions = extensions;
 450         /*
 451          * New definitions for parsing instead of regex
 452          */
 453         def->kindTable          = BibKinds;
 454         def->kindCount          = ARRAY_SIZE (BibKinds);
 455         def->parser                             = findBibTags;
 456         def->initialize         = initialize;
 457         def->keywordTable       = BibKeywordTable;
 458         def->keywordCount       = ARRAY_SIZE (BibKeywordTable);
 459         return def;
 460 }