Update wiki links to the new short URL
[aur.git] / schema / gendummydata.py
blobc7b3a06dc0caaa0d08df21627b2f5a120d75f539
1 #!/usr/bin/env python3
2 """
3 usage: gendummydata.py outputfilename.sql
4 """
6 # This script seeds the AUR database with dummy data for
7 # use during development/testing. It uses random entries
8 # from /usr/share/dict/words to create user accounts and
9 # package names. It generates the SQL statements to
10 # insert these users/packages into the AUR database.
12 import hashlib
13 import logging
14 import os
15 import random
16 import sys
17 import time
# ---------------------------------------------------------------------------
# Tunables for the generated data set.
# ---------------------------------------------------------------------------

# Logging verbosity; switch to logging.INFO for quieter runs.
LOG_LEVEL = logging.DEBUG

# Input files: the word list supplies user/package names, the fortune
# file supplies comment and proposal text.
SEED_FILE = "/usr/share/dict/words"
FORTUNE_FILE = "/usr/share/fortune/cookie"

# First database IDs handed out to generated rows.
USER_ID = 5  # Users.ID of the first bogus user
PKG_ID = 1  # Packages.ID of the first package

# Population sizes.  MAX_DEVS / MAX_TUS are fractions of MAX_USERS
# (0.1 == 10%), not whole-number percentages.
MAX_USERS = 76000  # how many users to 'register'
MAX_DEVS = 0.1  # share of MAX_USERS that become Developers
MAX_TUS = 0.2  # share of MAX_USERS that become Trusted Users
MAX_PKGS = 64000  # how many packages to load

# (low, high) bounds handed to the random number generator for the
# per-package row counts.
PKG_DEPS = (1, 15)  # depends per package
PKG_RELS = (1, 5)  # relations per package
PKG_SRC = (1, 3)  # sources per package
PKG_CMNTS = (1, 5)  # comments per package

CATEGORIES_COUNT = 17  # the number of categories from aur-schema
VOTING = (0, 0.001)  # per-user vote count bounds, as fractions of all packages
OPEN_PROPOSALS = 5  # number of open trusted user proposals
CLOSE_PROPOSALS = 15  # number of closed trusted user proposals

# Building blocks for random source URLs.
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
# Configure the root logger to emit "<LEVEL>: <message>" lines at the
# verbosity chosen by LOG_LEVEL, and keep a handle to it for the script.
logformat = "%(levelname)s: %(message)s"
logging.basicConfig(level=LOG_LEVEL, format=logformat)
log = logging.getLogger()
# Exactly one command-line argument (the output file name) is required.
if len(sys.argv) != 2:
    log.error("Missing output filename argument")
    raise SystemExit(1)

# Both input files must exist before any work starts: the seed file
# provides names, the fortune file provides comment text.  Each entry
# pairs a path with the message naming the package that ships it.
for required_path, install_hint in (
    (SEED_FILE, "Please install the 'words' Arch package"),
    (FORTUNE_FILE, "Please install the 'fortune-mod' Arch package"),
):
    if not os.path.exists(required_path):
        log.error(install_hint)
        raise SystemExit(1)
# Bookkeeping for the names handed out so far: each dict maps a
# generated name to the database ID assigned to it.  user_keys holds the
# user names as a list so one can be picked by random index.
seen_users, seen_pkgs = {}, {}
user_keys = []
68 # some functions to generate random data
def genVersion():
    """Return a random package version string.

    The result has the form "MAJOR.MINOR-REL" or "MAJOR.MINOR.PATCH-REL"
    (the patch component is included about half of the time), with
    MAJOR in 0..9, MINOR in 0..19, PATCH in 0..99 and REL in 1..10.
    """
    major = random.randrange(0, 10)
    minor = random.randrange(0, 20)
    components = [str(major), str(minor)]
    # Coin flip: zero means "add a third version component".
    if random.randrange(0, 2) == 0:
        components.append(str(random.randrange(0, 100)))
    release = random.randrange(1, 11)
    return "%s-%d" % (".".join(components), release)
def genCategory():
    """Return a random category ID in the range 1 .. CATEGORIES_COUNT - 1.

    NOTE(review): the upper bound is exclusive, so CATEGORIES_COUNT
    itself is never returned -- confirm whether the last category is
    meant to be reachable.
    """
    lowest_id = 1
    return lowest_id + random.randrange(CATEGORIES_COUNT - lowest_id)
def genUID():
    """Return the database ID of a randomly chosen generated user."""
    position = random.randrange(0, len(user_keys))
    username = user_keys[position]
    return seen_users[username]
def genFortune():
    """Return a random entry from ``fortunes`` with single quotes removed.

    Removing the quotes lets the text be placed inside a single-quoted
    SQL string literal by the statement templates in this script.
    """
    position = random.randrange(0, len(fortunes))
    text = fortunes[position]
    return text.replace("'", "")
# load the words, and make sure there are enough words for users/pkgs
#
log.debug("Grabbing words from seed file...")
# The context manager closes the file even if reading/decoding fails.
with open(SEED_FILE, "r", encoding="utf-8") as fp:
    contents = fp.readlines()

# Never ask for more users or packages than there are seed lines.
MAX_USERS = min(MAX_USERS, len(contents))
MAX_PKGS = min(MAX_PKGS, len(contents))

# need_dupes is 0 only when, after reserving one line per user, more
# than MAX_PKGS lines remain; otherwise package names are allowed to
# coincide with user names.
need_dupes = 0 if len(contents) - MAX_USERS > MAX_PKGS else 1
# select random usernames
#
log.debug("Generating random user names...")

# Normalise every seed line once and drop repeats.  Several raw lines can
# collapse to the same name (e.g. "Cat's" and "cats" both become "cats"),
# so the number of distinct names can be smaller than the number of
# lines.  dict.fromkeys keeps first-seen order, which keeps the outcome
# reproducible for a fixed random seed.
user_candidates = list(dict.fromkeys(
    line.replace("'", "").replace(".", "").replace(" ", "_").strip().lower()
    for line in contents
))

# Drawing names until MAX_USERS distinct ones have been collected would
# never finish when fewer distinct names exist, so cap the target.
if MAX_USERS > len(user_candidates):
    log.warning("Seed file yields only %d distinct names; reducing "
                "MAX_USERS from %d", len(user_candidates), MAX_USERS)
    MAX_USERS = len(user_candidates)

# Sampling without replacement picks exactly MAX_USERS distinct names;
# IDs are handed out sequentially starting at USER_ID.
user_id = USER_ID
for word in random.sample(user_candidates, MAX_USERS):
    seen_users[word] = user_id
    user_id += 1
user_keys = list(seen_users.keys())
# select random package names
#
log.debug("Generating random package names...")

# Normalise every seed line once and drop repeats (several raw lines can
# collapse to the same name once quotes, dots and case are removed).
pkg_candidates = list(dict.fromkeys(
    line.replace("'", "").replace(".", "").replace(" ", "_").strip().lower()
    for line in contents
))

if not need_dupes:
    # Prefer names that no user has.  need_dupes was derived from the raw
    # line count, so confirm that enough distinct unused names really
    # exist; if not, fall back to allowing overlap with user names rather
    # than searching forever for names that are not there.
    unused_names = [name for name in pkg_candidates if name not in seen_users]
    if len(unused_names) >= MAX_PKGS:
        pkg_candidates = unused_names
    else:
        log.warning("Only %d names are unused by users; package names "
                    "may duplicate user names", len(unused_names))

# Cap the target at what is actually available so selection terminates.
if MAX_PKGS > len(pkg_candidates):
    log.warning("Seed file yields only %d distinct names; reducing "
                "MAX_PKGS from %d", len(pkg_candidates), MAX_PKGS)
    MAX_PKGS = len(pkg_candidates)

# Sampling without replacement picks exactly MAX_PKGS distinct names;
# IDs are handed out sequentially starting at PKG_ID.
num_pkgs = PKG_ID
for word in random.sample(pkg_candidates, MAX_PKGS):
    seen_pkgs[word] = num_pkgs
    num_pkgs += 1
# The word list is no longer needed; drop the reference so its memory
# can be reclaimed.
contents = None

# IDs of the accounts promoted to developer / trusted user, plus flags
# that flip to 1 once the respective quota has been filled.
developers, trustedusers = [], []
has_devs = has_tus = 0
# Just let python throw the errors if any happen
#
out = open(sys.argv[1], "w", encoding="utf-8")
out.write("BEGIN;\n")

# Begin by creating the User statements
#
log.debug("Creating SQL statements for users.")
dev_quota = MAX_DEVS * MAX_USERS
tu_quota = MAX_TUS * MAX_USERS
for u in user_keys:
    account_type = 1  # default to normal user
    if not has_devs or not has_tus:
        # Roll a role, but only grant it while its quota is still open.
        # The rolled value is kept separate from account_type so that a
        # roll for an already-full role leaves the user a normal account
        # instead of writing an untracked developer/TU row.
        rolled_type = random.randrange(1, 4)
        if rolled_type == 3 and not has_devs:
            # this will be a dev account
            account_type = 3
            developers.append(seen_users[u])
            if len(developers) >= dev_quota:
                has_devs = 1
        elif rolled_type == 2 and not has_tus:
            # this will be a trusted user account
            account_type = 2
            trustedusers.append(seen_users[u])
            if len(trustedusers) >= tu_quota:
                has_tus = 1

    # The password column is filled with the MD5 hex digest of the
    # user name.
    h = hashlib.new('md5')
    h.update(u.encode())
    s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
         " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
    s = s % (seen_users[u], account_type, u, u, h.hexdigest())
    out.write(s)

log.debug("Number of developers: %d", len(developers))
log.debug("Number of trusted users: %d", len(trustedusers))
# Count from the rows actually written rather than the configured target.
log.debug("Number of users: %d",
          len(user_keys) - len(developers) - len(trustedusers))
log.debug("Number of packages: %d", MAX_PKGS)
log.debug("Gathering text from fortune file...")
# Entries in the fortune file are separated by lines holding a single
# "%".  The context manager closes the file even if reading/decoding
# fails part-way through.
with open(FORTUNE_FILE, "r", encoding="utf-8") as fp:
    fortunes = fp.read().split("%\n")
# Create the package statements
#
log.debug("Creating SQL statements for packages.")

# Statement templates, filled in once per package / comment below.
pkgbase_sql = (
    "INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
    "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', '', %d, %d, %d, %s, %s);\n")
package_sql = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
               "(%d, %d, '%s', '%s');\n")
comment_sql = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
               " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")

count = 0
for p, pkg_id in list(seen_pkgs.items()):
    NOW = int(time.time())

    # Even-numbered packages draw maintainer and packager from the
    # developers, odd-numbered ones from the trusted users.
    pool = trustedusers if count % 2 else developers
    muid = pool[random.randrange(0, len(pool))]
    puid = pool[random.randrange(0, len(pool))]
    if count % 20 == 0:  # every so often, there are orphans...
        muid = "NULL"

    uuid = genUID()  # the submitter/user

    # One PackageBases row and one Packages row share the same ID.
    out.write(pkgbase_sql % (pkg_id, p, NOW, NOW, uuid, muid, puid))
    out.write(package_sql % (pkg_id, pkg_id, p, genVersion()))

    count += 1

    # Random comments for this package, each stamped between 400
    # seconds and three days after NOW.
    for _ in range(random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])):
        now = NOW + random.randrange(400, 86400 * 3)
        out.write(comment_sql % (pkg_id, genUID(), genFortune(), now))
# Cast votes
#
track_votes = {}
log.debug("Casting votes for packages.")

# Vote only for IDs that were actually assigned, instead of assuming the
# IDs run from 1 to len(seen_pkgs).
pkg_ids = list(seen_pkgs.values())

# Per-user vote count bounds, scaled by the number of packages.
min_votes = int(len(pkg_ids) * VOTING[0])
max_votes = int(len(pkg_ids) * VOTING[1])

for u in user_keys:
    # randrange() raises ValueError on an empty range, which happens
    # whenever both bounds truncate to the same integer (small package
    # counts); use the lower bound directly in that case.
    if max_votes > min_votes:
        num_votes = random.randrange(min_votes, max_votes)
    else:
        num_votes = min_votes

    pkgvote = set()  # packages this user has already voted for
    for _ in range(num_votes):
        pkg = random.choice(pkg_ids)
        if pkg in pkgvote:
            continue  # a user votes for a given package at most once
        pkgvote.add(pkg)
        track_votes[pkg] = track_votes.get(pkg, 0) + 1
        s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
             " VALUES (%d, %d);\n")
        out.write(s % (seen_users[u], pkg))

# Update statements for package votes
#
for p, vote_total in track_votes.items():
    s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
    out.write(s % (vote_total, p))
# Create package dependencies and sources
#
log.debug("Creating statements for package depends/sources.")
# the keys of seen_pkgs are accessed many times by random.choice,
# so the list has to be created outside the loops to keep it efficient
seen_pkgs_keys = list(seen_pkgs.keys())
# NOTE(review): randrange() excludes its upper bound, so the second value
# of PKG_DEPS / PKG_RELS / PKG_SRC is never drawn -- confirm whether the
# documented "max" is meant to be inclusive.
for p in seen_pkgs_keys:
    # Dependencies: a random package name and a type in 1..4; type 4
    # additionally gets a ": for <other package>" suffix.
    num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
    for i in range(num_deps):
        dep = random.choice(seen_pkgs_keys)
        deptype = random.randrange(1, 5)
        if deptype == 4:
            dep += ": for " + random.choice(seen_pkgs_keys)
        s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
        s = s % (seen_pkgs[p], deptype, dep)
        out.write(s)

    # Relations: loop over num_rels (not num_deps) so that PKG_RELS
    # really controls how many relation rows are written.
    num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
    for i in range(num_rels):
        rel = random.choice(seen_pkgs_keys)
        reltype = random.randrange(1, 4)
        s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
        s = s % (seen_pkgs[p], reltype, rel)
        out.write(s)

    # Sources: a made-up URL of the form
    # <scheme><pkg>.<tld>/<dir>/<word>-<version>.tar.gz
    num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
    for i in range(num_sources):
        src_file = user_keys[random.randrange(0, len(user_keys))]
        src = "%s%s.%s/%s/%s-%s.tar.gz" % (
            RANDOM_URL[random.randrange(0, len(RANDOM_URL))],
            p, RANDOM_TLDS[random.randrange(0, len(RANDOM_TLDS))],
            RANDOM_LOCS[random.randrange(0, len(RANDOM_LOCS))],
            src_file, genVersion())
        s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
        s = s % (seen_pkgs[p], src)
        out.write(s)
# Create trusted user proposals
#
log.debug("Creating SQL statements for trusted user proposals.")
one_day = 3600 * 24
one_week = one_day * 7
proposal_sql = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
                " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
for count in range(OPEN_PROPOSALS + CLOSE_PROPOSALS):
    now = int(time.time())
    if count >= CLOSE_PROPOSALS:
        # Open proposal: starts now, ends between one day and one week
        # from now.
        start = now
        end = now + random.randrange(one_day, one_week)
    else:
        # Closed proposal: submitted one to three weeks ago, ended at
        # some point within the past week.
        start = now - random.randrange(one_week, one_week * 3)
        end = now - random.randrange(0, one_week)

    # Every fifth proposal is not about any particular user.
    if count % 5 == 0:
        user = ""
    else:
        user = user_keys[random.randrange(0, len(user_keys))]

    suid = trustedusers[random.randrange(0, len(trustedusers))]
    out.write(proposal_sql % (genFortune(), user, start, end, suid))
# Terminate the transaction opened by the leading "BEGIN;", add a
# trailing blank line, and close the output file.
out.writelines(("COMMIT;\n", "\n"))
out.close()
log.debug("Done.")