gendummydata.py: Fix to make it less db specific
[aur.git] / schema / gendummydata.py
blob373f82ea861d109c1bb3862ba69c6fb80e70498a
1 #!/usr/bin/python3
2 """
3 usage: gendummydata.py outputfilename.sql
4 """
6 # This script seeds the AUR database with dummy data for
7 # use during development/testing. It uses random entries
8 # from /usr/share/dict/words to create user accounts and
9 # package names. It generates the SQL statements to
10 # insert these users/packages into the AUR database.
12 import hashlib
13 import random
14 import time
15 import os
16 import sys
17 import io
18 import logging
20 LOG_LEVEL = logging.DEBUG # logging level. set to logging.INFO to reduce output
21 SEED_FILE = "/usr/share/dict/words"
22 DB_HOST = os.getenv("DB_HOST", "localhost")
23 DB_NAME = os.getenv("DB_NAME", "AUR")
24 DB_USER = os.getenv("DB_USER", "aur")
25 DB_PASS = os.getenv("DB_PASS", "aur")
26 USER_ID = 5 # Users.ID of first bogus user
27 PKG_ID = 1 # Packages.ID of first package
28 MAX_USERS = 300 # how many users to 'register'
29 MAX_DEVS = .1 # what percentage of MAX_USERS are Developers
30 MAX_TUS = .2 # what percentage of MAX_USERS are Trusted Users
31 MAX_PKGS = 900 # how many packages to load
32 PKG_DEPS = (1, 15) # min/max depends a package has
33 PKG_RELS = (1, 5) # min/max relations a package has
34 PKG_SRC = (1, 3) # min/max sources a package has
35 PKG_CMNTS = (1, 5) # min/max number of comments a package has
36 CATEGORIES_COUNT = 17 # the number of categories from aur-schema
37 VOTING = (0, .30) # percentage range for package voting
38 OPEN_PROPOSALS = 5 # number of open trusted user proposals
39 CLOSE_PROPOSALS = 15 # number of closed trusted user proposals
40 RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
41 RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
42 RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
43 FORTUNE_FILE = "/usr/share/fortune/cookie"
45 # setup logging
46 logformat = "%(levelname)s: %(message)s"
47 logging.basicConfig(format=logformat, level=LOG_LEVEL)
48 log = logging.getLogger()
50 if len(sys.argv) != 2:
51 log.error("Missing output filename argument")
52 raise SystemExit
54 # make sure the seed file exists
56 if not os.path.exists(SEED_FILE):
57 log.error("Please install the 'words' Arch package")
58 raise SystemExit
60 # make sure comments can be created
62 if not os.path.exists(FORTUNE_FILE):
63 log.error("Please install the 'fortune-mod' Arch package")
64 raise SystemExit
66 # track what users/package names have been used
68 seen_users = {}
69 seen_pkgs = {}
70 user_keys = []
72 # some functions to generate random data
74 def genVersion():
75 ver = []
76 ver.append("%d" % random.randrange(0,10))
77 ver.append("%d" % random.randrange(0,20))
78 if random.randrange(0,2) == 0:
79 ver.append("%d" % random.randrange(0,100))
80 return ".".join(ver) + "-%d" % random.randrange(1,11)
81 def genCategory():
82 return random.randrange(1,CATEGORIES_COUNT)
83 def genUID():
84 return seen_users[user_keys[random.randrange(0,len(user_keys))]]
85 def genFortune():
86 return fortunes[random.randrange(0,len(fortunes))].replace("'", "")
89 # load the words, and make sure there are enough words for users/pkgs
91 log.debug("Grabbing words from seed file...")
92 fp = open(SEED_FILE, "r", encoding="utf-8")
93 contents = fp.readlines()
94 fp.close()
95 if MAX_USERS > len(contents):
96 MAX_USERS = len(contents)
97 if MAX_PKGS > len(contents):
98 MAX_PKGS = len(contents)
99 if len(contents) - MAX_USERS > MAX_PKGS:
100 need_dupes = 0
101 else:
102 need_dupes = 1
104 # select random usernames
106 log.debug("Generating random user names...")
107 user_id = USER_ID
108 while len(seen_users) < MAX_USERS:
109 user = random.randrange(0, len(contents))
110 word = contents[user].replace("'", "").replace(".","").replace(" ", "_")
111 word = word.strip().lower()
112 if word not in seen_users:
113 seen_users[word] = user_id
114 user_id += 1
115 user_keys = list(seen_users.keys())
117 # select random package names
119 log.debug("Generating random package names...")
120 num_pkgs = PKG_ID
121 while len(seen_pkgs) < MAX_PKGS:
122 pkg = random.randrange(0, len(contents))
123 word = contents[pkg].replace("'", "").replace(".","").replace(" ", "_")
124 word = word.strip().lower()
125 if not need_dupes:
126 if word not in seen_pkgs and word not in seen_users:
127 seen_pkgs[word] = num_pkgs
128 num_pkgs += 1
129 else:
130 if word not in seen_pkgs:
131 seen_pkgs[word] = num_pkgs
132 num_pkgs += 1
134 # free up contents memory
136 contents = None
138 # developer/tu IDs
140 developers = []
141 trustedusers = []
142 has_devs = 0
143 has_tus = 0
145 # Just let python throw the errors if any happen
147 out = open(sys.argv[1], "w", encoding="utf-8")
148 out.write("BEGIN;\n")
150 # Begin by creating the User statements
152 log.debug("Creating SQL statements for users.")
153 for u in user_keys:
154 account_type = 1 # default to normal user
155 if not has_devs or not has_tus:
156 account_type = random.randrange(1, 4)
157 if account_type == 3 and not has_devs:
158 # this will be a dev account
160 developers.append(seen_users[u])
161 if len(developers) >= MAX_DEVS * MAX_USERS:
162 has_devs = 1
163 elif account_type == 2 and not has_tus:
164 # this will be a trusted user account
166 trustedusers.append(seen_users[u])
167 if len(trustedusers) >= MAX_TUS * MAX_USERS:
168 has_tus = 1
169 else:
170 # a normal user account
172 pass
174 h = hashlib.new('md5')
175 h.update(u.encode());
176 s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
177 " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
178 s = s % (seen_users[u], account_type, u, u, h.hexdigest())
179 out.write(s)
181 log.debug("Number of developers: %d" % len(developers))
182 log.debug("Number of trusted users: %d" % len(trustedusers))
183 log.debug("Number of users: %d" % (MAX_USERS-len(developers)-len(trustedusers)))
184 log.debug("Number of packages: %d" % MAX_PKGS)
186 log.debug("Gathering text from fortune file...")
187 fp = open(FORTUNE_FILE, "r", encoding="utf-8")
188 fortunes = fp.read().split("%\n")
189 fp.close()
191 # Create the package statements
193 log.debug("Creating SQL statements for packages.")
194 count = 0
195 for p in list(seen_pkgs.keys()):
196 NOW = int(time.time())
197 if count % 2 == 0:
198 muid = developers[random.randrange(0,len(developers))]
199 puid = developers[random.randrange(0,len(developers))]
200 else:
201 muid = trustedusers[random.randrange(0,len(trustedusers))]
202 puid = trustedusers[random.randrange(0,len(trustedusers))]
203 if count % 20 == 0: # every so often, there are orphans...
204 muid = "NULL"
206 uuid = genUID() # the submitter/user
208 s = ("INSERT INTO PackageBases (ID, Name, SubmittedTS, ModifiedTS, "
209 "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', %d, %d, %d, %s, %s);\n")
210 s = s % (seen_pkgs[p], p, NOW, NOW, uuid, muid, puid)
211 out.write(s)
213 s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
214 "(%d, %d, '%s', '%s');\n")
215 s = s % (seen_pkgs[p], seen_pkgs[p], p, genVersion())
216 out.write(s)
218 count += 1
220 # create random comments for this package
222 num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
223 for i in range(0, num_comments):
224 now = NOW + random.randrange(400, 86400*3)
225 s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
226 " Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n")
227 s = s % (seen_pkgs[p], genUID(), genFortune(), now)
228 out.write(s)
230 # Cast votes
232 track_votes = {}
233 log.debug("Casting votes for packages.")
234 for u in user_keys:
235 num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
236 int(len(seen_pkgs)*VOTING[1]))
237 pkgvote = {}
238 for v in range(num_votes):
239 pkg = random.randrange(1, len(seen_pkgs) + 1)
240 if pkg not in pkgvote:
241 s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
242 " VALUES (%d, %d);\n")
243 s = s % (seen_users[u], pkg)
244 pkgvote[pkg] = 1
245 if pkg not in track_votes:
246 track_votes[pkg] = 0
247 track_votes[pkg] += 1
248 out.write(s)
250 # Update statements for package votes
252 for p in list(track_votes.keys()):
253 s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
254 s = s % (track_votes[p], p)
255 out.write(s)
257 # Create package dependencies and sources
259 log.debug("Creating statements for package depends/sources.")
260 for p in list(seen_pkgs.keys()):
261 num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
262 for i in range(0, num_deps):
263 dep = random.choice([k for k in seen_pkgs])
264 deptype = random.randrange(1, 5)
265 if deptype == 4:
266 dep += ": for " + random.choice([k for k in seen_pkgs])
267 s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
268 s = s % (seen_pkgs[p], deptype, dep)
269 out.write(s)
271 num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
272 for i in range(0, num_deps):
273 rel = random.choice([k for k in seen_pkgs])
274 reltype = random.randrange(1, 4)
275 s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
276 s = s % (seen_pkgs[p], reltype, rel)
277 out.write(s)
279 num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
280 for i in range(num_sources):
281 src_file = user_keys[random.randrange(0, len(user_keys))]
282 src = "%s%s.%s/%s/%s-%s.tar.gz" % (
283 RANDOM_URL[random.randrange(0,len(RANDOM_URL))],
284 p, RANDOM_TLDS[random.randrange(0,len(RANDOM_TLDS))],
285 RANDOM_LOCS[random.randrange(0,len(RANDOM_LOCS))],
286 src_file, genVersion())
287 s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
288 s = s % (seen_pkgs[p], src)
289 out.write(s)
291 # Create trusted user proposals
293 log.debug("Creating SQL statements for trusted user proposals.")
294 count=0
295 for t in range(0, OPEN_PROPOSALS+CLOSE_PROPOSALS):
296 now = int(time.time())
297 if count < CLOSE_PROPOSALS:
298 start = now - random.randrange(3600*24*7, 3600*24*21)
299 end = now - random.randrange(0, 3600*24*7)
300 else:
301 start = now
302 end = now + random.randrange(3600*24, 3600*24*7)
303 if count % 5 == 0: # Don't make the vote about anyone once in a while
304 user = ""
305 else:
306 user = user_keys[random.randrange(0,len(user_keys))]
307 suid = trustedusers[random.randrange(0,len(trustedusers))]
308 s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
309 " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
310 s = s % (genFortune(), user, start, end, suid)
311 out.write(s)
312 count += 1
314 # close output file
316 out.write("COMMIT;\n")
317 out.write("\n")
318 out.close()
319 log.debug("Done.")