Quick-fix for NUL in searches
[pgweb/local.git] / pgweb / search / views.py
blob4dc66500844d0955faf5cb46ba24655e776e27e6
1 from django.shortcuts import render
2 from django.http import HttpResponseRedirect
3 from django.views.decorators.csrf import csrf_exempt
4 from django.conf import settings
6 from pgweb.util.decorators import cache
8 import urllib.parse
9 import requests
10 import psycopg2
12 from pgweb.lists.models import MailingList
# Conditionally import memcached library. Everything will work without
# it, so we allow development installs to run without it...
try:
    import pylibmc
    has_memcached = True
except ImportError:
    # Narrowed from a bare "except:": we only want to tolerate the
    # library being absent, not mask KeyboardInterrupt/SystemExit or
    # genuine errors raised while importing it.
    has_memcached = False
def generate_pagelinks(pagenum, totalpages, querystring):
    """Yield HTML fragments linking the pages of a search result.

    Produced in Python rather than in the template because the
    windowing logic is simply too ugly to express there. Yields
    nothing at all when there is only a single page.
    """
    if totalpages < 2:
        return

    if pagenum > 1:
        # Link back to the previous page
        yield '<a href="%s&p=%s">Prev</a>' % (querystring, pagenum - 1)

    # Show a window of up to 20 page numbers, beginning at most
    # 10 pages before the current one.
    first = pagenum - 10 if pagenum > 10 else 1
    for page in range(first, min(first + 20, totalpages + 1)):
        if page == pagenum:
            # Current page is plain text, not a link
            yield "%s" % page
        else:
            yield '<a href="%s&p=%s">%s</a>' % (querystring, page, page)

    if pagenum != min(first + 20, totalpages):
        # Link forward to the next page
        yield '<a href="%s&p=%s">Next</a>' % (querystring, pagenum + 1)
49 @csrf_exempt
50 @cache(minutes=15)
51 def search(request):
52 # Perform a general web search
53 # Since this lives in a different database, we open a direct
54 # connection with psycopg, thus bypassing everything that has to do
55 # with django.
57 # constants that we might eventually want to make configurable
58 hitsperpage = 20
60 if request.GET.get('m', '') == '1':
61 searchlists = True
63 if request.GET.get('l', '') != '':
64 try:
65 listid = int(request.GET['l'])
66 except:
67 listid = None
68 else:
69 # Listid not specified. But do we have the name?
70 if 'ln' in request.GET:
71 try:
72 ll = MailingList.objects.get(listname=request.GET['ln'])
73 listid = ll.id
74 except MailingList.DoesNotExist:
75 # Invalid list name just resets the default of the form,
76 # no need to throw an error.
77 listid = None
78 else:
79 listid = None
81 if 'd' in request.GET:
82 try:
83 dateval = int(request.GET['d'])
84 except:
85 dateval = None
86 else:
87 dateval = None
89 if 's' in request.GET:
90 listsort = request.GET['s']
91 if listsort not in ('r', 'd', 'i'):
92 listsort = 'r'
93 else:
94 listsort = 'r'
96 if not dateval:
97 dateval = 365
99 sortoptions = (
100 {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
101 {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
102 {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
104 dateoptions = (
105 {'val': -1, 'text': 'anytime'},
106 {'val': 1, 'text': 'within last day'},
107 {'val': 7, 'text': 'within last week'},
108 {'val': 31, 'text': 'within last month'},
109 {'val': 186, 'text': 'within last 6 months'},
110 {'val': 365, 'text': 'within last year'},
112 else:
113 searchlists = False
114 suburl = request.GET.get('u', None)
115 allsites = request.GET.get('a', None) == "1"
117 # Check that we actually have something to search for
118 if request.GET.get('q', '') == '':
119 if searchlists:
120 return render(request, 'search/listsearch.html', {
121 'search_error': "No search term specified.",
122 'sortoptions': sortoptions,
123 'lists': MailingList.objects.all().order_by("group__sortkey"),
124 'listid': listid,
125 'dates': dateoptions,
126 'dateval': dateval,
128 else:
129 return render(request, 'search/sitesearch.html', {
130 'search_error': "No search term specified.",
132 query = request.GET['q'].strip()
133 if '\0' in query:
134 return render(request, 'search/sitesearch.html', {
135 'search_error': "Invalid character in search.",
138 # Anti-stefan prevention
139 if len(query) > 1000:
140 return render(request, 'search/sitesearch.html', {
141 'search_error': "Search term too long.",
144 # Is the request being paged?
145 try:
146 pagenum = int(request.GET.get('p', 1))
147 except:
148 pagenum = 1
150 firsthit = (pagenum - 1) * hitsperpage + 1
152 if searchlists:
153 # Lists are searched by passing the work down using a http
154 # API. In the future, we probably want to do everything
155 # through a http API and merge hits, but that's for later
156 p = {
157 'q': query.encode('utf-8'),
158 's': listsort,
160 if listid:
161 if listid < 0:
162 # This is a list group, we expand that on the web server
163 p['ln'] = ','.join([x.listname for x in MailingList.objects.filter(group=-listid)])
164 else:
165 p['ln'] = MailingList.objects.get(pk=listid).listname
166 if dateval:
167 p['d'] = dateval
168 urlstr = urllib.parse.urlencode(p)
169 # If memcached is available, let's try it
170 hits = None
171 if has_memcached:
172 memc = pylibmc.Client(['127.0.0.1', ], binary=True)
173 # behavior not supported on pylibmc in squeeze:: behaviors={'tcp_nodelay':True})
174 try:
175 hits = memc.get(urlstr)
176 except Exception:
177 # If we had an exception, don't try to store either
178 memc = None
179 if not hits:
180 # No hits found - so try to get them from the search server
181 try:
182 r = requests.post(
183 "{}://{}/archives-search/".format(settings.ARCHIVES_SEARCH_PLAINTEXT and 'http' or 'https', settings.ARCHIVES_SEARCH_SERVER),
184 urlstr,
185 headers={
186 'Content-type': 'application/x-www-form-urlencoded; charset=utf-8',
188 timeout=5,
190 except requests.exceptions.Timeout:
191 return render(request, 'search/listsearch.html', {
192 'search_error': 'Timeout when talking to search server. Please try your search again later, or with a more restrictive search terms.',
194 except:
195 return render(request, 'search/listsearch.html', {
196 'search_error': 'General error when talking to search server.',
198 if r.status_code != 200:
199 memc = None
200 return render(request, 'search/listsearch.html', {
201 'search_error': 'Error talking to search server: %s' % r.reason,
203 hits = r.json()
204 if has_memcached and memc:
205 # Store them in memcached too! But only for 10 minutes...
206 # And always compress it, just because we can
207 memc.set(urlstr, hits, 60 * 10, 1)
208 memc = None
210 if isinstance(hits, dict):
211 # This is not just a list of hits.
212 # Right now the only supported dict result is a messageid
213 # match, but make sure that's what it is.
214 if hits['messageidmatch'] == 1:
215 return HttpResponseRedirect("/message-id/%s" % query)
217 totalhits = len(hits)
218 querystr = "?m=1&q=%s&l=%s&d=%s&s=%s" % (
219 urllib.parse.quote_plus(query.encode('utf-8')),
220 listid or '',
221 dateval,
222 listsort
225 return render(request, 'search/listsearch.html', {
226 'hitcount': totalhits,
227 'firsthit': firsthit,
228 'lasthit': min(totalhits, firsthit + hitsperpage - 1),
229 'query': request.GET['q'],
230 'pagelinks': "&nbsp;".join(
231 generate_pagelinks(pagenum,
232 totalhits // hitsperpage + 1,
233 querystr)),
234 'hits': [{
235 'date': h['d'],
236 'subject': h['s'],
237 'author': h['f'],
238 'messageid': h['m'],
239 'abstract': h['a'],
240 'rank': h['r'],
241 } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
242 'sortoptions': sortoptions,
243 'lists': MailingList.objects.all().order_by("group__sortkey"),
244 'listid': listid,
245 'dates': dateoptions,
246 'dateval': dateval,
249 else:
250 # Website search is still done by making a regular pgsql connection
251 # to the search server.
252 try:
253 conn = psycopg2.connect(settings.SEARCH_DSN)
254 curs = conn.cursor()
255 except:
256 return render(request, 'search/sitesearch.html', {
257 'search_error': 'Could not connect to search database.'
260 # This is kind of a hack, but... Some URLs are flagged as internal
261 # and should as such only be included in searches that explicitly
262 # reference the suburl that they are in.
263 if suburl and suburl.startswith('/docs/devel'):
264 include_internal = True
265 else:
266 include_internal = False
268 # perform the query for general web search
269 try:
270 curs.execute("SELECT * FROM site_search(%(query)s, %(firsthit)s, %(hitsperpage)s, %(allsites)s, %(suburl)s, %(internal)s)", {
271 'query': query,
272 'firsthit': firsthit - 1,
273 'hitsperpage': hitsperpage,
274 'allsites': allsites,
275 'suburl': suburl,
276 'internal': include_internal,
278 except psycopg2.ProgrammingError:
279 return render(request, 'search/sitesearch.html', {
280 'search_error': 'Error executing search query.'
283 hits = curs.fetchall()
284 conn.close()
285 totalhits = int(hits[-1][5])
286 try:
287 if suburl:
288 quoted_suburl = urllib.parse.quote_plus(suburl)
289 else:
290 quoted_suburl = ''
291 except:
292 quoted_suburl = ''
293 querystr = "?q=%s&a=%s&u=%s" % (
294 urllib.parse.quote_plus(query.encode('utf-8')),
295 allsites and "1" or "0",
296 quoted_suburl,
299 return render(request, 'search/sitesearch.html', {
300 'suburl': suburl,
301 'allsites': allsites,
302 'hitcount': totalhits,
303 'firsthit': firsthit,
304 'lasthit': min(totalhits, firsthit + hitsperpage - 1),
305 'query': request.GET['q'],
306 'pagelinks': "&nbsp;".join(
307 generate_pagelinks(pagenum,
308 totalhits // hitsperpage + 1,
309 querystr)),
310 'hits': [{
311 'title': h[3],
312 'url': "%s%s" % (h[1], h[2]),
313 'abstract': h[4].replace("[[[[[[", "<strong>").replace("]]]]]]", "</strong>"),
314 'rank': h[5]} for h in hits[:-1]],