[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / postprocess
blob021a19d2636b4f2a1ee3c0db09b6b7eaa2ba460e
1 #!/usr/bin/env python
2 # Postprocess click data files.
4 # Copyright (C) 2017 Vivek Pal
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 2 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 # USA
21 """Postprocesses search and clicks data files.
23 Generates the final clickstream log file and query file for Xapian Letor
24 module from that log file.
25 """
27 from __future__ import print_function
29 import argparse
30 import collections
31 import csv
32 import os
33 import sys
34 import tempfile
def generate_combined_log(search_log, clicks_log, final_log):
    """Generates the final log file.

    Joins each search-log entry with the per-document click counts
    recorded in the clicks log, and writes one CSV row per unique,
    non-empty query.

    Input Args:
        search_log (str): Path to the search log file.
        clicks_log (str): Path to the clicks log file.
        final_log (str): Path to the final log file.

    Example (comma-delimited) entries in search_log:

        821f03288846297c2cf43c34766a38f7,"book","45,54",0
        d41d8cd98f00b204e9800998ecf8427e,"","",0
        098f6bcd4621d373cade4e832627b4f6,"test","35,47",0

    Example (comma-delimited) entries in clicks_log:

        821f03288846297c2cf43c34766a38f7,54
        821f03288846297c2cf43c34766a38f7,54
        098f6bcd4621d373cade4e832627b4f6,35

    Example (comma-delimited) entries in final_log:

        QueryID,Query,Hits,Offset,Clicks
        821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
        098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"
    """
    QUERYID, QUERY, HITS = 0, 1, 2
    DOCID = 1

    # Build map: qid_to_clicks = {qid: {did: click_count}}.
    # A Counter per query id replaces the manual "seen before?" test,
    # and defaultdict makes the per-qid Counter spring into existence.
    qid_to_clicks = collections.defaultdict(collections.Counter)

    with open(clicks_log, 'r') as clicks_f:
        for row in csv.reader(clicks_f):
            qid_to_clicks[row[QUERYID]][row[DOCID]] += 1

    with open(search_log, 'r') as search_f, open(final_log, 'w+') as final_f:
        search_reader = csv.reader(search_f)
        writer = csv.writer(final_f)

        # Add headers to final log file.
        writer.writerow(["QueryID", "Query", "Hits", "Offset", "Clicks"])

        queries = set()

        for row in search_reader:
            # Skip rows with empty Query string or empty Hitlist.
            if row[QUERY] == '' or row[HITS] == '':
                continue

            # Avoid duplicate entries for the same query string.
            if row[QUERY] in queries:
                continue

            queries.add(row[QUERY])

            # Convert Hitlist from str to list.  (Empty Hitlists were
            # already skipped above, so no empty-string special case.)
            hits = row[HITS].strip().split(',')

            # Annotate every hit with its click count as "did:count";
            # documents with no recorded clicks get ":0".
            did_to_count = qid_to_clicks.get(row[QUERYID], {})
            clicklist = ['%s:%i' % (did, did_to_count.get(did, 0))
                         for did in hits]

            # Serialise "Hits" and "Clicks".
            row[HITS] = ','.join(hits)
            row.append(','.join(clicklist))
            writer.writerow(row)
def generate_query_file(final_log, query_file):
    """Generates query file formatted as per Xapian Letor documentation.

    Input Args:
        final_log (string): Path to final log file.
        query_file (string): Path to query file.

    Example (comma-delimited) entries in final_log:

        QueryID,Query,Hits,Offset,Clicks
        821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
        098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"

    Example (comma-delimited) entries in query_file:

        821f03288846297c2cf43c34766a38f7,book
        098f6bcd4621d373cade4e832627b4f6,test
    """
    with open(final_log, 'r') as s, open(query_file, 'w+') as w:
        # DictReader consumes the header row and lets us pick columns
        # by name instead of by index.
        reader = csv.DictReader(s)
        writer = csv.writer(w)

        for row in reader:
            writer.writerow([row['QueryID'], row['Query']])
def test_functions():
    """Smoke-tests generate_combined_log() and generate_query_file().

    Builds small search/clicks logs in temporary files, runs both
    generators, and asserts the expected rows come back out.
    """
    # Pre-bind so the finally block never hits a NameError (which would
    # mask the original exception) if an early statement fails.
    test_search = test_clicks = test_final = test_query = None
    try:
        # mode='w+' so str data can be written on Python 3 too (the
        # default mode 'w+b' would require bytes).
        test_search = tempfile.NamedTemporaryFile(mode='w+', delete=True)

        test_search.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
        test_search.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
        test_search.write('d41d8cd98f00b204e9800998ecf8427e,"","",0\n')
        test_search.write('098f6bcd4621d373cade4e832627b4f6,"test","35,47",0\n')
        test_search.flush()

        test_clicks = tempfile.NamedTemporaryFile(mode='w+', delete=True)

        test_clicks.write('821f03288846297c2cf43c34766a38f7,54\n')
        test_clicks.write('821f03288846297c2cf43c34766a38f7,54\n')
        test_clicks.write('098f6bcd4621d373cade4e832627b4f6,35\n')
        test_clicks.flush()

        test_final = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        generate_combined_log(test_search.name, test_clicks.name, test_final.name)
        test_final.flush()

        test_query = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        generate_query_file(test_final.name, test_query.name)
        test_query.flush()

        # Test entries in final log are correct.
        with open(test_final.name, 'r') as final_f:
            reader = csv.reader(final_f)

            # Skip header row via the csv reader itself, so the reader
            # and the underlying file never get out of sync.
            next(reader)

            for i, row in enumerate(reader):
                if i == 0:
                    assert row == ['821f03288846297c2cf43c34766a38f7',
                        'book', '45,54', '0', '45:0,54:2'], "Incorrect entry in final.log"
                if i == 1:
                    assert row == ['098f6bcd4621d373cade4e832627b4f6',
                        'test', '35,47', '0', '35:1,47:0'], "Incorrect entry in final.log"

        # Test entries in query file are correct.
        with open(test_query.name, 'r') as query_f:
            reader = csv.reader(query_f)

            for i, row in enumerate(reader):
                if i == 0:
                    assert row == ['821f03288846297c2cf43c34766a38f7','book'], "Incorrect entry in query.txt"
                if i == 1:
                    assert row == ['098f6bcd4621d373cade4e832627b4f6','test'], "Incorrect entry in query.txt"
    finally:
        # Close all files to safely delete them.
        for tmp_file in (test_search, test_clicks, test_final, test_query):
            if tmp_file is None:
                continue
            try:
                tmp_file.close()
            except IOError as e:
                print("Error: %s - %s." % (e.filename, e.strerror))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''Postprocess click data files.

This script generates the final clickstream log file from input search and
click log files and creates query file that can be used by the Xapian Letor
module for generating its training files.''',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("search_log", type=str, help="Path to the search.log file.")
    parser.add_argument("clicks_log", type=str, help="Path to the clicks.log file.")
    parser.add_argument("final_log", type=str, help="Path to save final.log file.")
    parser.add_argument("query_file", type=str, help="Path to save query.txt file.")
    parser.add_argument("--test", help="Run tests for this script.", action='store_true')
    args = parser.parse_args()

    if args.test:
        test_functions()
        # sys.exit() rather than the bare exit() helper: exit() is
        # injected by the site module and isn't guaranteed to exist.
        sys.exit()

    try:
        generate_combined_log(args.search_log, args.clicks_log, args.final_log)
        generate_query_file(args.final_log, args.query_file)
    except IOError as e:
        print(e, file=sys.stderr)