3 /////////////////////////////////////////////////////////////////////////////
5 // Copyright (c) 2002 Iain Murray
7 /////////////////////////////////////////////////////////////////////////////
10 #ifndef __AlphabetMap_h__
11 #define __AlphabetMap_h__
14 #include <sys/types.h>
17 #include "../DasherTypes.h"
29 /// If I were just using GCC, which comes with the CGI "STL" implementation, I would
30 /// use hash_map (which isn't part of the ANSI/ISO standard C++ STL, but hey it's nice).
31 /// Using a plain map is just too slow for training on large files (or it is with certain
32 /// STL implementations). I'm sure training could be made much faster still, but that's
35 /// While I could (and probably should) get a hash_map for VC++ from
36 /// http://www.stlport.org I thought it would be nicer if people didn't have
37 /// to download extra stuff and then have to get it working alongside the STL
38 /// with VC++, especially for just one small part of Dasher.
40 /// The result is this:
41 /// ***************************************************
42 /// very much thrown together to get Dasher out ASAP.
43 /// ***************************************************
44 /// It is deliberately not like an STL container.
45 /// However, as it has a tiny interface, it should still be easy to replace.
46 /// Sorry if this seems really unprofressional.
48 /// Replacing it might be a good idea. On the other hand it could be customised
49 /// to the needs of the alphabet, so that it works faster. For example,
50 /// currently if I have a string "asdf", it might be that "a" is checked
51 /// then "as" is checked then "asd" is checked. I shouldn't need to keep
52 /// rehashing the leading characters. I plan to fix that here. Doing so with
53 /// a standard hash_map would be hard.
56 /// alphabet_map MyMap(NumberOfEntriesWeExpect); // Can omit NumberOfEntriesWeExpect
57 /// MyMap.add("asdf", 15);
58 /// symbol i = MyMap.get("asdf") // i=15
59 /// symbol j = MyMap.get("fdsa") // j=0
61 /// You can't remove items once they are added as Dasher has no need for that.
64 class Dasher::alphabet_map
{
67 alphabet_map(unsigned int InitialTableSize
= 255);
68 void Add(const std::string
& Key
, symbol Value
);
69 symbol
Get(const std::string
& Key
, bool * KeyIsPrefix
) const;
74 Entry(std::string Key
, symbol Symbol
, Entry
* Next
)
75 : Key(Key
), KeyIsPrefix(false), Symbol(Symbol
), Next(Next
) {
82 void RecursiveAdd(const std::string
& Key
, symbol Value
, bool PrefixFlag
);
84 // A standard hash -- could try and research something specific.
85 inline unsigned int Hash(const std::string
& Input
) const {
86 unsigned int Result
= 0;
88 typedef std::string::const_iterator CI
;
89 CI Cur
= Input
.begin();
93 Result
= (Result
<< 1) ^ *Cur
++;
94 Result
%= HashTable
.size();
98 if (Input.size()==1) // Speedup for ASCII text
101 for (int i=0; i<Input.size(); i++)
102 Result = (Result<<1)^Input[i];
104 return Result%HashTable.size();
106 } std::vector
< Entry
> Entries
;
107 std::vector
< Entry
* >HashTable
;
108 const symbol Undefined
;
112 #endif /* #ifndef __AlphabetMap_h__ */