kgs fetch module
[gostyle.git] / kgs / kgs.py
blobcc2a8d665b46ccf5c4a20045d3061a6ab27e3ec7
1 # -*- coding: utf-8 -*-
import datetime
import logging
import os
import random
import re
import shutil
import subprocess
import tarfile
import tempfile
import time
import urllib2
class KGSError(RuntimeError):
    """Base class of the errors raised while talking to KGS."""
class KGSNotFound(KGSError):
    """Raised when KGS answers a request with HTTP 404."""
class KGS(object):
    """Object used to fetch data from KGS.

    The requests and results are cached, real requests are delayed not to
    contact the server too often.
    """

    # Shell script that downloads (unless cached) the archive page of a player
    # and boils it down with sed into two plain-text listings.
    # Placeholders, in order: cache file, temporary file, player, year, month.
    # Outputs: "<tmp>.games" with one "name rank" line per player of a 19x19
    # game, and "<tmp>" with one "year month" line per linked month.
    # NOTE: the literal contains a non-ASCII character, so under Python 2 the
    # file needs its utf-8 coding declaration.
    _LIST_SCRIPT = u"""
wget_outfile=%s
tmp_file=%s

[ -e $wget_outfile ] || {
    # wget creates the -O target even when the download fails; remove it so
    # that a failed request is not mistaken for a cached result later on
    wget --user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1" \\
        "http://www.gokgs.com/gameArchives.jsp?user=%s&year=%s&month=%s" -O $wget_outfile \\
        || { rm -f $wget_outfile; exit 1; }
}

[ -e $wget_outfile ] || { exit 1; }

sed 's/<tr>/\\n&/g' $wget_outfile > $tmp_file

# even games in the month
sed '/<td>19×19 <\\/td>/!d' $tmp_file > ${tmp_file}.games
sed -i 's/<td>/\\n&/g' ${tmp_file}.games
sed -i -n 's#.*<a href="gameArchives.jsp?user=\\([a-zA-Z0-9]*\\)">\\1 \\[\\([0-9]*[kd][?]\\{0,1\\}\\)\\]</a>.*#\\1 \\2#p' ${tmp_file}.games

# active months
sed -i 's/<td>/\\n&/g' $tmp_file
sed -n -i 's#.*href="gameArchives.jsp?user=\\([a-zA-Z0-9]*\\)&amp;year=\\([0-9]*\\)&amp;month=\\([0-9]*\\)">.*#\\2 \\3#p' $tmp_file
"""

    def __init__(self, cache_dir, min_delay=5):
        """
        Arguments:
        cache_dir -- where to save the requests and results, created if does
                     not exist
        min_delay -- minimal delay between two real requests, in seconds
        """
        # the time of the last request (as returned by time.time(), 0 = never)
        self.last = 0

        # minimal delay between requests
        self.min_delay = min_delay

        self.cache_dir = cache_dir

    def fetch_archive_and_extract(self, player, year, month):
        """
        Fetches games of @player, from @year and @month, extracts the archive
        and returns the directory with games.

        The archive is downloaded only if it is not in the cache yet.

        Raises:
        ValueError -- @player is not a non-empty alphanumeric name
        KGSNotFound -- KGS answered with HTTP 404
        KGSError -- any other failure of the request
        """
        # the name becomes a part of file paths, keep it from escaping them
        self._check_player(player)

        basename = '%s-%d-%d.tar.gz' % (player, int(year), int(month))
        url = 'http://www.gokgs.com/servlet/archives/en_US/' + basename

        ## FETCH ARCHIVE
        archive_dir = os.path.join(self.cache_dir, 'ARCHIVES', str(year), str(month))
        self._ensure_dir(archive_dir)

        archive_file = os.path.join(archive_dir, basename)
        if not os.path.exists(archive_file):
            self._download_archive(url, archive_file)

        ## EXTRACT
        games_dir = os.path.join(self.cache_dir, 'GAMES', str(year), str(month), str(player))
        self._ensure_dir(games_dir)

        tf = tarfile.open(archive_file)
        try:
            # the archive comes from the network, extract only harmless members
            tf.extractall(games_dir, members=self._safe_members(tf, games_dir))
        finally:
            tf.close()

        return games_dir

    def list_games(self, player, year, month):
        """ Returns list of tuples
        [ (playername, rank), ..]
        one tuple for each of the two players (white and black) of every
        19x19 game of @player in @year / @month.

        Only games listed as plain 19x19 on the archive page are regarded
        (presumably the even ones).
        """
        ret_games, _ = self._player_archive(player, year, month)
        return ret_games

    def list_active(self, player):
        """
        Returns list of tuples:
        [ (year, month), ] (both as strings) such that @player was active in
        the year and month. The list is built from the archive page of the
        current month.

        Raises KGSError if no active month was found.
        """
        today = datetime.datetime.today()
        year, month = today.year, today.month

        _, ret_active = self._player_archive(player, year, month)

        if not ret_active:
            raise KGSError("Not an active user '%s'."%player)
        return ret_active

    def wait_min_delay(self):
        """Sleeps until at least min_delay seconds passed since the last request."""
        diff = time.time() - self.last
        if diff < self.min_delay:
            # up to one extra second of random jitter on top of the rest
            time.sleep(self.min_delay - diff + random.random())

    def update_last_time(self):
        """Remembers the current time as the time of the last request."""
        self.last = time.time()

    def _player_archive(self, player, year, month):
        """
        Parses the (cached) archive page of @player for @year and @month.

        Returns a tuple (games, active):
        games -- list of (playername, rank) tuples, see list_games
        active -- list of (year, month) string tuples found on the page; the
                  requested month is added if it contains any game

        Raises:
        ValueError -- invalid @player, or @year / @month not a nonzero integer
        KGSError -- the page could not be fetched or processed
        """
        # Explicit checks instead of asserts (those vanish under -O): the
        # values are pasted into a shell script below.
        self._check_player(player)
        if not int(year) or not int(month):
            raise ValueError("Invalid year / month: %r / %r." % (year, month))

        wget_file_dir = os.path.join(self.cache_dir, 'LIST', str(year), str(month))
        self._ensure_dir(wget_file_dir)
        wget_file = os.path.join(wget_file_dir, player)

        # private scratch directory, removed on every exit path
        tmpdir = tempfile.mkdtemp(prefix='kgs_fetch')
        try:
            tmpname = os.path.join(tmpdir, 'list')
            script = self._LIST_SCRIPT % (wget_file, tmpname, player,
                                          int(year), int(month))

            # if the target file does not exist
            # = it is not cached => we will make a request
            must_fetch = not os.path.exists(wget_file)
            if must_fetch:
                logging.info("Fetching KGS list of active months for '%s'"%(player))
                self.wait_min_delay()

            try:
                retcode = subprocess.call(script, shell=True)
            finally:
                # a cache hit contacts nobody, so it must not postpone
                # the next real request
                if must_fetch:
                    self.update_last_time()
            if retcode:
                raise KGSError("Fetching KGS games failed.")

            ret_games = self._read_pairs(tmpname + '.games')
            ret_active = self._read_pairs(tmpname)
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)

        if ret_games:
            current = (str(year), str(month))
            if current not in ret_active:
                ret_active.append(current)
            else:
                logging.warning("Current month already in the list of active months..")

        return (ret_games, ret_active)

    def _download_archive(self, url, archive_file):
        """Downloads @url into @archive_file, honouring the request delay."""
        self.wait_min_delay()
        logging.info("Fetching KGS archive '%s'"%(url))
        try:
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    raise KGSNotFound()
                if e.code == 503:
                    raise KGSError("KGS quota exceeded. Please download your latest games manually and upload them as an archive.")
                # any other HTTP error is reported by the handler below
                raise
        except urllib2.URLError as e:
            raise KGSError("Fetching the KGS archive failed: '%s'\n %s"%(e, url))
        finally:
            self.update_last_time()

        try:
            data = response.read()
        finally:
            response.close()

        # Write next to the target and rename afterwards, so that an
        # interrupted write never leaves a truncated archive in the cache.
        partial = archive_file + '.part'
        try:
            # binary mode: the archive is gzip data, not text
            with open(partial, 'wb') as archive:
                archive.write(data)
            os.rename(partial, archive_file)
        finally:
            if os.path.exists(partial):
                os.unlink(partial)

    @staticmethod
    def _check_player(player):
        """Raises ValueError unless @player is a non-empty alphanumeric name."""
        # \Z rather than $, which would also accept a trailing newline
        if not re.match(r'[0-9a-zA-Z]+\Z', player):
            raise ValueError("Invalid KGS player name %r." % (player,))

    @staticmethod
    def _ensure_dir(path):
        """Creates the directory @path (with parents) unless it exists."""
        if not os.path.isdir(path):
            os.makedirs(path)

    @staticmethod
    def _safe_members(archive, dest_dir):
        """
        Returns the members of the open tarfile @archive which are regular
        files or directories located inside @dest_dir; the others (links,
        devices, paths pointing outside) are skipped with a warning.
        """
        root = os.path.realpath(dest_dir)
        safe = []
        for member in archive.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            inside = target == root or target.startswith(root + os.sep)
            if inside and (member.isfile() or member.isdir()):
                safe.append(member)
            else:
                logging.warning("Skipping unsafe member '%s' of a KGS archive.",
                                member.name)
        return safe

    @staticmethod
    def _read_pairs(path):
        """
        Reads the file @path and returns a list with one tuple for each line
        made of exactly two whitespace separated fields; other lines are
        ignored.
        """
        pairs = []
        with open(path, 'r') as fin:
            for line in fin:
                fields = line.split()
                if len(fields) == 2:
                    pairs.append((fields[0], fields[1]))
        return pairs
if __name__ == "__main__":
    # manual smoke test: list the active months of one player,
    # caching everything under ./OUT
    kgs = KGS("OUT")
    print(kgs.list_active('bronislav'))