pli/tags/generic.py

   1 #=======================================================================
   2
   3 __version__ = '''0.3.07'''
   4 __sub_version__ = '''20070720011023'''
   5 __copyright__ = '''(c) Alex A. Naanou 2007'''
   6
   7
   8 #-----------------------------------------------------------------------
   9
  10
  11
  12 #-----------------------------------------------------------------------
  13 #
  14 # XXX will need to make this adaptable to OOBTree (string keys)...
  15 #     ...most likely will concern adding an overlay over the store.
  16 # XXX try to make this linear.... (currently it appears to eat time at
  17 #     non-linear, possibly quadratic rates with the increase of data-sets)
  18 #     ...mostly, this concerns the tagging; selecting appears to be
  19 #     less affected...
  20 #
  21 #     does not appear to be the algorithm (considering how stupid it is)...
  22 #     the bottleneck appears to be the Python set (and possibly the
  23 #     memory manager)...
  24 # XXX this is essentially a relation management... thus, it might be
  25 #     good to seporate the "relation" and "tagging" specifics into two
  26 #     modules...
  27 #
  28 #
  29 # TODO add an OO extension... (replace the current tag.py)
  30 #
  31 #
  32 #
  33 #-----------------------------------------------------------------------
  34 __doc__ = '''\
  35 This module implements a basic object tagging engine.
  36
  37 This involves tag manipulation (tagging and untagging) and tag based
  38 searches.
  39
  40 In this system there is almost no distinction between the tag and the
  41 tagged object, other than that they are tagged by two different system
  42 tags: "tag" and "object". Both are stored in the same store and treated
  43 alike. There are also no restrictions to the format of either tag
  44 or the tagged object, though it is recommended for the tags to be
  45 str/unicode objects, so as to harness some optimisations within Python
  46 and supporting libraries.
  47
  48 There are also basic structural consistency verification and restoration
  49 routines implemented here.
  50
  51
  52 The Tag Store
  53 -------------
  54
  55 A dict-compatible object used to store objects and tags.
  56
  57 Semantics:
  58         Each value stores the tags/objects related to it's key.
  59
  60         When the object is tagged, the whole chain (including the object)
  61         is treated as a set of related tags. The tag is recorded as a key
  62         and the rest are recorded in a set as a value to that key. This is
  63         done for each tag in the given set.
  64
  65 Results:
  66         + trivial and fast search.
  67         + trivial, though not the fastest addition, and essentially no
  68           need for balancing (unless the store is tampered with manually).
  69         - redundant linkage within the tag store.
  70
  71
  72 NOTE: there is no distinction between tagged abjects and tags other
  73       than the two special tags "tag" and "object". They are both
  74       treated the same and stored in one structure.
  75
  76 Structure:
  77         { <tag>: set([<tag>, ...]), ...}
  78
  79
  80
  81 Notes
  82 -----
  83
  84 It is expected that the number of tags will grow far slower than the
  85 number of objects (after stabilizing the objects in a live system
  86 will exhibit linear growth, while tags will almost plateau at some point).
  87
  88 In this approach, the number of objects will be extremely large.
  89
  90 The two sub-groups (tags and objects) have slight differences. Tags
  91 tend to be highly interlinked while objects rarely exhibit linkage
  92 with each other.
  93
  94 Most searches will be tag oriented, possibly with a final filtration
  95 by tag or object.
  96
  97
  98
  99 '''
 100
 101 #-----------------------------------------------------------------------
 102 # store-level functions...
 103 #----------------------------------------------------istagsconsistent---
 104 # a store is consistent if:
 105 # - all tags in relations are present in store keys.
 106 # - if no orphan tags are allowed (???) each tag in keys MUST also be
 107 #   present in relations (related to).
 108 #
 109 # algorithm:
 110 #       union of all relations must be a subset of the union of all keys.
 111 #
 112 # NOTE: use strict equality to check for orphans...
 113 #
 114 # XXX should this ignore orphaned tags???
 115 def istagsconsistent(tagdb):
 116         '''
 117         '''
 118         rel = set()
 119         for r in tagdb.values():
 120                 rel.update(r)
 121         # XXX ignore orphans... (to check for them use strict equality)
 122         if rel.issubset(set(tagdb.keys())):
 123                 return True
 124         return False
 125
 126
 127 #---------------------------------------------------------itertaggaps---
 128 # XXX should this ignore orphaned tags???
 129 def itertaggaps(tagdb):
 130         '''
 131         find store inconsistencies and return the conflicting keys and relations.
 132
 133         this can not detect the folowing:
 134                 - missing orphaned keys (no data).
 135                 - interlinking between missing keys (no data).
 136                 - inconsistencies in relations (no way to destinguish this from
 137                   good data).
 138         '''
 139         keys = set(tagdb.keys())
 140         for tag, rel in tagdb.items():
 141                 if not keys.issuperset(rel):
 142                         yield tag, rel.difference(keys)
 143
 144
 145 #---------------------------------------------------------filltaggaps---
 146 # XXX should these two be split??
 147 # XXX might be good make this an interactive generator so as to have
 148 #     more control over what is fixed and how...
 149 # XXX should this restore the tag to self??
 150 # XXX might be good to make this explicitly depend on itertaggaps
 151 #    (through arguments)...
 152 def filltaggaps(tagdb):
 153         '''
 154         fix inconsistencies using the data returned by itertaggaps.
 155
 156         NOTE: this will restore the data that can be detected and restored
 157               only (no domain semantic checks are made at this level).
 158         NOTE: this is maximalistic. will fill the holes rather than cut off
 159               the excess.
 160         '''
 161         # NOTE: this is split in two so as to not iterate and modify the
 162         #       store at the same time...
 163         tdb_diff = {}
 164         # bild the diff correcting the errors...
 165         for key, dif in itertaggaps(tagdb):
 166                 for k in dif:
 167                         if k in tdb_diff:
 168                                 tdb_diff[k].update((key,))
 169                         else:
 170                                 tdb_diff[k] = set((key,))
 171         # apply the diff created above...
 172         for k, rel in tdb_diff.items():
 173                 # add a link to self (XXX this should be in _iter_store_gaps)
 174                 rel.update((k,))
 175                 if k in tagdb:
 176                         tagdb[k].update(rel)
 177                 else:
 178                         tagdb[k] = rel
 179         # return the diff...
 180         return tdb_diff
 181
 182
 183
 184 #-----------------------------------------------------------------------
 185 # low-level "naive" functions...
 186 #----------------------------------------------------------------_tag---
 187 # XXX what should this return??
 188 # XXX shows signs of exponential time increase on very large sets of
 189 #     data... need to revise.
 190 def _tag(tagdb, obj, *tags):
 191         '''
 192         add the tags to store.
 193         '''
 194         tt = set((obj,) + tags)
 195
 196         for t in tt:
 197                 if t in tagdb:
 198 ##                      ##!!! this appears to be a MAJOR bottleneck and slows everything down...
 199 ##                      ##!!! most of the time is spent in union (at least this is what the profiler is showing)
 200 ##                      ##!!! ...appears to take exponentially more time...
 201 ##                      tagdb[t] = tagdb[t].union(tt)
 202                         # XXX this appears to make this about 3 orders of magnitude faster @ ~1M objects... odd!
 203                         tagdb[t].update(tt)
 204                 else:
 205                         tagdb[t] = tt.copy()
 206
 207
 208 #--------------------------------------------------------------_untag---
 209 ##!!! test !!!##
 210 # XXX what should this return??
 211 # XXX should this remove orphaned tags???
 212 def _untag(tagdb, obj, *tags):
 213         '''
 214         remove tags from store.
 215         '''
 216         tt = set((obj,) + tags)
 217
 218         for t in tt:
 219                 if t not in tagdb:
 220                         # ignore invalid tags... (XXX should we complain here?)
 221                         continue
 222                 # remove the reqired tags...
 223                 tagdb[t].difference_update(tt)
 224                 # remove tag if it has no relations... (XXX)
 225                 if len(tagdb[t]) == 0:
 226                         del tagdb[t]
 227
 228
 229
 230 #-----------------------------------------------------------------------
 231 # user interface functions...
 232 #-----------------------------------------------------------------tag---
 233 # XXX what should this return??
 234 def tag(tagdb, obj, *tags):
 235         '''
 236         tag an object...
 237
 238         this maintains two special tags:
 239                 tag             : tags all the tags.
 240                 object  : tags all the objects.
 241
 242         NOTE: the tags "objects" and "tag" are meant for searches.
 243         NOTE: neither the "object" nor the "tag" tags are user modifiable.
 244         '''
 245         # do special tags...
 246         # can't manually use the tag and object tags...
 247         if 'tag' in tags or 'object' in tags or obj in ('tag', 'object'):
 248                 raise TypeError, 'can\'t use either "object" or "tag" tags manually.'
 249         # the tag tag...
 250         for t in tags:
 251                 if 'tag' not in tagdb.get(t, ()):
 252                         _tag(tagdb, t, 'tag')
 253         # the object tag...
 254         if 'object' not in tagdb.get(obj, ()):
 255                 _tag(tagdb, obj, 'object')
 256                 if 'tag' not in tagdb.get('object', ()):
 257                         _tag(tagdb, 'object', 'tag')
 258         # no do the work that the user actually requested...
 259         _tag(tagdb, obj, *tags)
 260
 261
 262 #---------------------------------------------------------------untag---
 263 # XXX what should this return??
 264 ##!!! test !!!##
 265 ##!!! what should we do here with the special tags here???
 266 def untag(tagdb, obj, *tags):
 267         '''
 268         remove the tag relation.
 269         '''
 270         # do special tags...
 271         # can't manually use the tag and object tags...
 272         if 'tag' in tags or 'object' in tags or obj in ('tag', 'object'):
 273                 raise TypeError, 'can\'t use either "object" or "tag" tags manually.'
 274         ##!!!
 275         # now remove the chain...
 276         _untag(tagdb, obj, *tags)
 277
 278
 279 #--------------------------------------------------------------select---
 280 def select(tagdb, tag, *tags):
 281         '''
 282         select a set of data using tags.
 283
 284         NOTE: this will return both tags and tagged objects. to control this
 285               use the "tag" and "object" tags...
 286         '''
 287         # a small optimisation: order the tags to cut out as mach as
 288         # possible as early as possible... (XXX check for better strategies)
 289         l = [tag] + list(tags)
 290         l.sort(lambda a, b: cmp(len(tagdb[a]), len(tagdb[b])))
 291         tag, tags = l[0], l[1:]
 292         # now do the real work...
 293         visited = set(l)
 294         res = set(tagdb[tag])
 295         # this does the folowing:
 296         # - for each tag select all the tagged objects.
 297         # - intersect the set with the tagged objects of each of the next tags.
 298         # - remove all the tags of the path (XXX not sure if this should be
 299         #   done at this stage...)
 300         for t in tags:
 301 ##              visited.update(t)
 302                 res.intersection_update(tagdb[t])
 303         return res.difference(visited)
 304
 305
 306
 307
 308 #=======================================================================
 309 #                                            vim:set ts=4 sw=4 nowrap :