tagging release
[dasher.git] / Src / DasherCore / Alphabet / AlphabetMap.h
blob34591eb924cdb10568e064c3c6adcf052f3a57fc
1 // AlphabetMap.h
2 //
3 /////////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2002 Iain Murray
6 //
7 /////////////////////////////////////////////////////////////////////////////
10 #ifndef __AlphabetMap_h__
11 #define __AlphabetMap_h__
13 #ifndef DASHER_WIN32
14 #include <sys/types.h>
15 #endif
17 #include "../DasherTypes.h"
19 #include <vector>
20 #include <string>
22 namespace Dasher {
23 class alphabet_map;
26 /// \ingroup Alphabet
27 /// \{
29 /// If I were just using GCC, which comes with the CGI "STL" implementation, I would
30 /// use hash_map (which isn't part of the ANSI/ISO standard C++ STL, but hey it's nice).
31 /// Using a plain map is just too slow for training on large files (or it is with certain
32 /// STL implementations). I'm sure training could be made much faster still, but that's
33 /// another matter...
34 ///
35 /// While I could (and probably should) get a hash_map for VC++ from
36 /// http://www.stlport.org I thought it would be nicer if people didn't have
37 /// to download extra stuff and then have to get it working alongside the STL
38 /// with VC++, especially for just one small part of Dasher.
39 ///
40 /// The result is this:
41 /// ***************************************************
42 /// very much thrown together to get Dasher out ASAP.
43 /// ***************************************************
44 /// It is deliberately not like an STL container.
45 /// However, as it has a tiny interface, it should still be easy to replace.
46 /// Sorry if this seems really unprofressional.
47 ///
48 /// Replacing it might be a good idea. On the other hand it could be customised
49 /// to the needs of the alphabet, so that it works faster. For example,
50 /// currently if I have a string "asdf", it might be that "a" is checked
51 /// then "as" is checked then "asd" is checked. I shouldn't need to keep
52 /// rehashing the leading characters. I plan to fix that here. Doing so with
53 /// a standard hash_map would be hard.
54 ///
55 /// Usage:
56 /// alphabet_map MyMap(NumberOfEntriesWeExpect); // Can omit NumberOfEntriesWeExpect
57 /// MyMap.add("asdf", 15);
58 /// symbol i = MyMap.get("asdf") // i=15
59 /// symbol j = MyMap.get("fdsa") // j=0
60 ///
61 /// You can't remove items once they are added as Dasher has no need for that.
62 ///
63 /// IAM 08/2002
64 class Dasher::alphabet_map {
66 public:
67 alphabet_map(unsigned int InitialTableSize = 255);
68 void Add(const std::string & Key, symbol Value);
69 symbol Get(const std::string & Key, bool * KeyIsPrefix) const;
71 private:
72 class Entry {
73 public:
74 Entry(std::string Key, symbol Symbol, Entry * Next)
75 : Key(Key), KeyIsPrefix(false), Symbol(Symbol), Next(Next) {
76 } std::string Key;
77 bool KeyIsPrefix;
78 symbol Symbol;
79 Entry *Next;
82 void RecursiveAdd(const std::string & Key, symbol Value, bool PrefixFlag);
84 // A standard hash -- could try and research something specific.
85 inline unsigned int Hash(const std::string & Input) const {
86 unsigned int Result = 0;
88 typedef std::string::const_iterator CI;
89 CI Cur = Input.begin();
90 CI end = Input.end();
92 while(Cur != end)
93 Result = (Result << 1) ^ *Cur++;
94 Result %= HashTable.size();
96 return Result;
98 if (Input.size()==1) // Speedup for ASCII text
99 return Input[0];
101 for (int i=0; i<Input.size(); i++)
102 Result = (Result<<1)^Input[i];
104 return Result%HashTable.size();
106 } std::vector < Entry > Entries;
107 std::vector < Entry * >HashTable;
108 const symbol Undefined;
110 /// \}
112 #endif /* #ifndef __AlphabetMap_h__ */