[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / postprocess
blob021a19d2636b4f2a1ee3c0db09b6b7eaa2ba460e
1 #!/usr/bin/env python
2 # Postprocess click data files.
4 # Copyright (C) 2017 Vivek Pal
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License as
8 # published by the Free Software Foundation; either version 2 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 # USA
21 """Postprocesses search and clicks data files.
23 Generates the final clickstream log file and query file for Xapian Letor
24 module from that log file.
25 """
27 from __future__ import print_function
29 import argparse
30 import collections
31 import csv
32 import os
33 import sys
34 import tempfile
def generate_combined_log(search_log, clicks_log, final_log):
    """Generates the final log file.

    Joins each search-log entry with the per-document click counts
    recorded in the clicks log, and writes one CSV row per unique,
    non-empty query.

    Input Args:
        search_log (str): Path to the search log file.
        clicks_log (str): Path to the clicks log file.
        final_log (str): Path to the final log file.

    Example (comma-delimited) entries in search_log:

        821f03288846297c2cf43c34766a38f7,"book","45,54",0
        d41d8cd98f00b204e9800998ecf8427e,"","",0
        098f6bcd4621d373cade4e832627b4f6,"test","35,47",0

    Example (comma-delimited) entries in clicks_log:

        821f03288846297c2cf43c34766a38f7,54
        821f03288846297c2cf43c34766a38f7,54
        098f6bcd4621d373cade4e832627b4f6,35

    Example (comma-delimited) entries in final_log:

        QueryID,Query,Hits,Offset,Clicks
        821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
        098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"
    """
    QUERYID, QUERY, HITS = 0, 1, 2
    DOCID = 1

    # Build map: qid_to_clicks = {qid: {did: click_count}}.
    # A Counter per query id replaces the manual "seen before?" test,
    # and defaultdict makes the per-qid Counter spring into existence.
    qid_to_clicks = collections.defaultdict(collections.Counter)

    with open(clicks_log, 'r') as clicks_f:
        for row in csv.reader(clicks_f):
            qid_to_clicks[row[QUERYID]][row[DOCID]] += 1

    with open(search_log, 'r') as search_f, open(final_log, 'w+') as final_f:
        search_reader = csv.reader(search_f)
        writer = csv.writer(final_f)

        # Add headers to final log file.
        writer.writerow(["QueryID", "Query", "Hits", "Offset", "Clicks"])

        queries = set()

        for row in search_reader:
            # Skip rows with empty Query string or empty Hitlist.
            if row[QUERY] == '' or row[HITS] == '':
                continue

            # Avoid duplicate entries for the same query string.
            if row[QUERY] in queries:
                continue

            queries.add(row[QUERY])

            # Convert Hitlist from str to list.  (Empty Hitlists were
            # already skipped above, so no empty-string special case.)
            hits = row[HITS].strip().split(',')

            # Annotate every hit with its click count as "did:count";
            # documents with no recorded clicks get ":0".
            did_to_count = qid_to_clicks.get(row[QUERYID], {})
            clicklist = ['%s:%i' % (did, did_to_count.get(did, 0))
                         for did in hits]

            # Serialise "Hits" and "Clicks".
            row[HITS] = ','.join(hits)
            row.append(','.join(clicklist))
            writer.writerow(row)
def generate_query_file(final_log, query_file):
    """Generates query file formatted as per Xapian Letor documentation.

    Input Args:
        final_log (string): Path to final log file.
        query_file (string): Path to query file.

    Example (comma-delimited) entries in final_log:

        QueryID,Query,Hits,Offset,Clicks
        821f03288846297c2cf43c34766a38f7,book,"45,54",0,"45:0,54:2"
        098f6bcd4621d373cade4e832627b4f6,test,"35,47",0,"35:1,47:0"

    Example (comma-delimited) entries in query_file:

        821f03288846297c2cf43c34766a38f7,book
        098f6bcd4621d373cade4e832627b4f6,test
    """
    with open(final_log, 'r') as s, open(query_file, 'w+') as w:
        # DictReader consumes the header row and lets us pick columns
        # by name instead of by index.
        reader = csv.DictReader(s)
        writer = csv.writer(w)

        for row in reader:
            writer.writerow([row['QueryID'], row['Query']])
def test_functions():
    """Smoke-tests generate_combined_log() and generate_query_file().

    Builds small search/clicks logs in temporary files, runs both
    generators, and asserts the expected rows come back out.
    """
    # Pre-bind so the finally block never hits a NameError (which would
    # mask the original exception) if an early statement fails.
    test_search = test_clicks = test_final = test_query = None
    try:
        # mode='w+' so str data can be written on Python 3 too (the
        # default mode 'w+b' would require bytes).
        test_search = tempfile.NamedTemporaryFile(mode='w+', delete=True)

        test_search.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
        test_search.write('821f03288846297c2cf43c34766a38f7,"book","45,54",0\n')
        test_search.write('d41d8cd98f00b204e9800998ecf8427e,"","",0\n')
        test_search.write('098f6bcd4621d373cade4e832627b4f6,"test","35,47",0\n')
        test_search.flush()

        test_clicks = tempfile.NamedTemporaryFile(mode='w+', delete=True)

        test_clicks.write('821f03288846297c2cf43c34766a38f7,54\n')
        test_clicks.write('821f03288846297c2cf43c34766a38f7,54\n')
        test_clicks.write('098f6bcd4621d373cade4e832627b4f6,35\n')
        test_clicks.flush()

        test_final = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        generate_combined_log(test_search.name, test_clicks.name, test_final.name)
        test_final.flush()

        test_query = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        generate_query_file(test_final.name, test_query.name)
        test_query.flush()

        # Test entries in final log are correct.
        with open(test_final.name, 'r') as final_f:
            reader = csv.reader(final_f)

            # Skip header row via the csv reader itself, so the reader
            # and the underlying file never get out of sync.
            next(reader)

            for i, row in enumerate(reader):
                if i == 0:
                    assert row == ['821f03288846297c2cf43c34766a38f7',
                        'book', '45,54', '0', '45:0,54:2'], "Incorrect entry in final.log"
                if i == 1:
                    assert row == ['098f6bcd4621d373cade4e832627b4f6',
                        'test', '35,47', '0', '35:1,47:0'], "Incorrect entry in final.log"

        # Test entries in query file are correct.
        with open(test_query.name, 'r') as query_f:
            reader = csv.reader(query_f)

            for i, row in enumerate(reader):
                if i == 0:
                    assert row == ['821f03288846297c2cf43c34766a38f7','book'], "Incorrect entry in query.txt"
                if i == 1:
                    assert row == ['098f6bcd4621d373cade4e832627b4f6','test'], "Incorrect entry in query.txt"
    finally:
        # Close all files to safely delete them.
        for tmp_file in (test_search, test_clicks, test_final, test_query):
            if tmp_file is None:
                continue
            try:
                tmp_file.close()
            except IOError as e:
                print("Error: %s - %s." % (e.filename, e.strerror))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''Postprocess click data files.

This script generates the final clickstream log file from input search and
click log files and creates query file that can be used by the Xapian Letor
module for generating its training files.''',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("search_log", type=str, help="Path to the search.log file.")
    parser.add_argument("clicks_log", type=str, help="Path to the clicks.log file.")
    parser.add_argument("final_log", type=str, help="Path to save final.log file.")
    parser.add_argument("query_file", type=str, help="Path to save query.txt file.")
    parser.add_argument("--test", help="Run tests for this script.", action='store_true')
    args = parser.parse_args()

    if args.test:
        test_functions()
        # sys.exit() rather than the bare exit() helper: exit() is
        # injected by the site module and isn't guaranteed to exist.
        sys.exit()

    try:
        generate_combined_log(args.search_log, args.clicks_log, args.final_log)
        generate_query_file(args.final_log, args.query_file)
    except IOError as e:
        print(e, file=sys.stderr)