# usage: gendummydata.py outputfilename.sql
#
# This script seeds the AUR database with dummy data for
# use during development/testing. It uses random entries
# from /usr/share/dict/words to create user accounts and
# package names. It generates the SQL statements to
# insert these users/packages into the AUR database.
# Tunables for the generated data set. The statements below were split
# across lines and prefixed with stray line numbers; they are restored to
# valid Python with their values unchanged.
LOG_LEVEL = logging.DEBUG  # logging level. set to logging.INFO to reduce output
SEED_FILE = "/usr/share/dict/words"  # word list that names are drawn from
USER_ID = 5  # Users.ID of first bogus user
PKG_ID = 1  # Packages.ID of first package
MAX_USERS = 76000  # how many users to 'register'
MAX_DEVS = .1  # what fraction of MAX_USERS are Developers
MAX_TUS = .2  # what fraction of MAX_USERS are Trusted Users
MAX_PKGS = 64000  # how many packages to load
PKG_DEPS = (1, 15)  # min/max depends a package has
PKG_RELS = (1, 5)  # min/max relations a package has
PKG_SRC = (1, 3)  # min/max sources a package has
PKG_CMNTS = (1, 5)  # min/max number of comments a package has
CATEGORIES_COUNT = 17  # the number of categories from aur-schema
VOTING = (0, .001)  # range, as a fraction of the package count, for votes per user
OPEN_PROPOSALS = 5  # number of open trusted user proposals
CLOSE_PROPOSALS = 15  # number of closed trusted user proposals
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
FORTUNE_FILE = "/usr/share/fortune/cookie"  # source text for comments/agendas
# Configure the root logger once; all progress and error messages of this
# script are emitted through `log` as "<LEVEL>: <message>".
logformat = "%(levelname)s: %(message)s"
logging.basicConfig(format=logformat, level=LOG_LEVEL)
log = logging.getLogger()
# Validate the command line and the external data files before doing any
# work. Every failed check is reported and then stops the script with a
# non-zero exit status; without the exit, a missing argument would only
# surface later as an IndexError on sys.argv[1].
if len(sys.argv) != 2:
    log.error("Missing output filename argument")
    raise SystemExit(1)

# make sure the seed file exists
if not os.path.exists(SEED_FILE):
    log.error("Please install the 'words' Arch package")
    raise SystemExit(1)

# make sure comments can be created
if not os.path.exists(FORTUNE_FILE):
    log.error("Please install the 'fortune-mod' Arch package")
    raise SystemExit(1)
# track what users/package names have been used
# some functions to generate random data
def genVersion():
    """Return a random version string such as ``3.14-2`` or ``0.7.42-10``.

    The format is ``major.minor[.patch]-release`` with major in 0-9, minor
    in 0-19, release in 1-10 and, about half of the time, a patch component
    in 0-99.
    """
    # NOTE(review): the `def` line and the initialisation of `ver` were
    # missing from this copy of the file; the name is taken from the
    # genVersion() call sites further down.
    ver = [
        "%d" % random.randrange(0, 10),
        "%d" % random.randrange(0, 20),
    ]
    if random.randrange(0, 2) == 0:
        ver.append("%d" % random.randrange(0, 100))
    return ".".join(ver) + "-%d" % random.randrange(1, 11)
def genCategory():
    """Return a random integer from 1 up to, but excluding, CATEGORIES_COUNT."""
    # NOTE(review): the `def` line of this helper was missing from this copy
    # of the file and nothing visible calls it; the name follows the other
    # gen*() helpers and must be confirmed against the original.
    return random.randrange(1, CATEGORIES_COUNT)
def genUID():
    """Return the Users.ID of one randomly chosen generated user.

    Reads the module-level `user_keys` (list of user names) and `seen_users`
    (user name -> ID); both must be populated before the first call.
    """
    # NOTE(review): the `def` line was missing from this copy of the file;
    # the name is taken from the genUID() call sites further down.
    return seen_users[random.choice(user_keys)]
def genFortune():
    """Return one random entry of `fortunes` with all single quotes removed.

    The quotes are dropped because callers paste the text into a
    single-quoted SQL string literal.
    """
    # NOTE(review): the `def` line was missing from this copy of the file;
    # the name is taken from the genFortune() call sites further down.
    return random.choice(fortunes).replace("'", "")
# load the words, and make sure there are enough words for users/pkgs
log.debug("Grabbing words from seed file...")
with open(SEED_FILE, "r", encoding="utf-8") as fp:
    contents = fp.readlines()


def _clean_word(raw):
    """Reduce one seed-file line to a lower-case name that is safe inside a
    single-quoted SQL literal: quotes and dots removed, spaces turned into
    underscores, surrounding whitespace stripped."""
    word = raw.replace("'", "").replace(".", "").replace(" ", "_")
    return word.strip().lower()


# Clean every word once and drop duplicates (and empty names) up front.
# Different seed lines can clean to the same name ("it's" / "its"), so the
# number of usable names can be smaller than the number of lines; clamping
# the limits to the line count could make a "draw until enough distinct
# names were seen" loop spin forever. dict.fromkeys keeps first-seen order.
words = list(dict.fromkeys(filter(None, map(_clean_word, contents))))

# free up contents memory
contents = None

MAX_USERS = min(MAX_USERS, len(words))
MAX_PKGS = min(MAX_PKGS, len(words))

# select random usernames
log.debug("Generating random user names...")
# IDs are handed out consecutively starting at USER_ID, in selection order.
seen_users = {
    name: uid
    for uid, name in enumerate(random.sample(words, MAX_USERS), USER_ID)
}
user_keys = list(seen_users)

# select random package names
log.debug("Generating random package names...")
# NOTE(review): the statements guarded by the word-count test, and the test
# choosing between the two package-name rules, were missing from this copy
# of the file. Pairing them as below (keep package names distinct from user
# names only when enough words are left over) is an assumption.
if len(words) - MAX_USERS > MAX_PKGS:
    candidates = [w for w in words if w not in seen_users]
else:
    candidates = words
# IDs are handed out consecutively starting at PKG_ID, in selection order.
seen_pkgs = {
    name: pid
    for pid, name in enumerate(random.sample(candidates, MAX_PKGS), PKG_ID)
}

# the word lists are no longer needed
del words, candidates
# Just let python throw the errors if any happen
out = open(sys.argv[1], "w", encoding="utf-8")
# Everything is emitted inside one transaction; the matching COMMIT is
# written at the very end of the script.
out.write("BEGIN;\n")
# Begin by creating the User statements
log.debug("Creating SQL statements for users.")

# NOTE(review): the loop header, the initialisation of the four names below,
# the bodies of the quota checks, the hash input and the final write were
# missing from this copy of the file and have been restored from how the
# remaining statements use them.
developers = []    # Users.ID of every account created as developer
trustedusers = []  # Users.ID of every account created as trusted user
has_devs = False   # True once MAX_DEVS * MAX_USERS developers exist
has_tus = False    # True once MAX_TUS * MAX_USERS trusted users exist

for u in user_keys:
    account_type = 1  # default to normal user
    if not has_devs or not has_tus:
        account_type = random.randrange(1, 4)
        if account_type == 3 and not has_devs:
            # this will be a dev account
            developers.append(seen_users[u])
            if len(developers) >= MAX_DEVS * MAX_USERS:
                has_devs = True
        elif account_type == 2 and not has_tus:
            # this will be a trusted user account
            trustedusers.append(seen_users[u])
            if len(trustedusers) >= MAX_TUS * MAX_USERS:
                has_tus = True
        else:
            # a normal user account; also reached when the drawn role's
            # quota is already full, so reset the type to keep
            # AccountTypeID in step with the two ID lists above
            account_type = 1

    h = hashlib.new('md5')
    # NOTE(review): what gets hashed was not visible; using the user name
    # as the dummy password is an assumption.
    h.update(u.encode())

    s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
         " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
    out.write(s % (seen_users[u], account_type, u, u, h.hexdigest()))
# Summarise what has been generated so far; "users" counts the accounts
# that are neither developers nor trusted users.
log.debug("Number of developers: %d", len(developers))
log.debug("Number of trusted users: %d", len(trustedusers))
log.debug("Number of users: %d",
          MAX_USERS - len(developers) - len(trustedusers))
log.debug("Number of packages: %d", MAX_PKGS)
log.debug("Gathering text from fortune file...")
# Entries in the fortune file are separated by a line holding a single "%".
# The with-block closes the file as soon as it has been read.
with open(FORTUNE_FILE, "r", encoding="utf-8") as fp:
    fortunes = fp.read().split("%\n")
# Create the package statements
log.debug("Creating SQL statements for packages.")

# NOTE(review): the counter, the test choosing between developers and
# trusted users, the orphan assignment and the writes were missing from
# this copy of the file. Alternating by parity and orphan == SQL NULL
# maintainer are assumptions; NULL fits because MaintainerUID/PackagerUID
# are formatted with %s rather than %d.
for count, p in enumerate(seen_pkgs):
    NOW = int(time.time())

    owners = developers if count % 2 == 0 else trustedusers
    muid = random.choice(owners)
    puid = random.choice(owners)
    if count % 20 == 0:  # every so often, there are orphans...
        muid = "NULL"

    uuid = genUID()  # the submitter/user

    s = ("INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
         "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', '', %d, %d, %d, %s, %s);\n")
    out.write(s % (seen_pkgs[p], p, NOW, NOW, uuid, muid, puid))

    # one package per package base, sharing its ID and name
    s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
         "(%d, %d, '%s', '%s');\n")
    out.write(s % (seen_pkgs[p], seen_pkgs[p], p, genVersion()))

    # create random comments for this package
    # (randrange excludes its upper bound, so PKG_CMNTS[1] itself is never drawn)
    num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
    for i in range(num_comments):
        # comments are dated between 400 seconds and three days after submission
        now = NOW + random.randrange(400, 86400*3)
        s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
             " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")
        out.write(s % (seen_pkgs[p], genUID(), genFortune(), now))
log.debug("Casting votes for packages.")

# NOTE(review): the per-user loop header, the initialisation of the vote
# bookkeeping and the writes were missing from this copy of the file; they
# are restored from the use of `u`, `pkgvote` and `track_votes` below.
pkg_ids = list(seen_pkgs.values())
votes_lo = int(len(seen_pkgs) * VOTING[0])
votes_hi = int(len(seen_pkgs) * VOTING[1])
# With fewer than 1/VOTING[1] packages both bounds are equal and randrange
# would raise on the empty range; widen it so such runs cast votes_lo votes.
votes_hi = max(votes_hi, votes_lo + 1)

track_votes = {}  # PackageBases.ID -> number of votes received
for u in user_keys:
    num_votes = random.randrange(votes_lo, votes_hi)
    pkgvote = set()  # packages this user has already voted for
    for v in range(num_votes):
        # draw from the IDs actually assigned instead of assuming 1..N,
        # which only holds while PKG_ID is 1
        pkg = random.choice(pkg_ids)
        if pkg not in pkgvote:
            pkgvote.add(pkg)
            track_votes[pkg] = track_votes.get(pkg, 0) + 1
            s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
                 " VALUES (%d, %d);\n")
            out.write(s % (seen_users[u], pkg))

# Update statements for package votes
for p, votes in track_votes.items():
    s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
    out.write(s % (votes, p))
# Create package dependencies and sources
log.debug("Creating statements for package depends/sources.")
# the keys of seen_pkgs are accessed many times by random.choice,
# so the list has to be created outside the loops to keep it efficient
seen_pkgs_keys = list(seen_pkgs.keys())

# NOTE(review): the writes after each statement and the test in front of the
# ": for ..." suffix were missing from this copy of the file. Restricting
# the suffix to DepTypeID 4 (taken to be optional dependencies, which carry
# a "name: description" text) is an assumption to confirm against the schema.
# In all three randrange calls the upper bound of the PKG_* pair is exclusive.
for p in seen_pkgs_keys:
    num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
    for i in range(num_deps):
        dep = random.choice(seen_pkgs_keys)
        deptype = random.randrange(1, 5)
        if deptype == 4:
            dep += ": for " + random.choice(seen_pkgs_keys)
        s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
        out.write(s % (seen_pkgs[p], deptype, dep))

    num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
    # iterate num_rels here: the loop used num_deps, so PKG_RELS had no
    # effect on how many relations a package received
    for i in range(num_rels):
        rel = random.choice(seen_pkgs_keys)
        reltype = random.randrange(1, 4)
        s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
        out.write(s % (seen_pkgs[p], reltype, rel))

    num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
    for i in range(num_sources):
        # a random user name doubles as the tarball's base name
        src_file = random.choice(user_keys)
        src = "%s%s.%s/%s/%s-%s.tar.gz" % (
            random.choice(RANDOM_URL),
            p, random.choice(RANDOM_TLDS),
            random.choice(RANDOM_LOCS),
            src_file, genVersion())
        s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
        out.write(s % (seen_pkgs[p], src))
# Create trusted user proposals
log.debug("Creating SQL statements for trusted user proposals.")

# NOTE(review): the `else` branches, the initialisation/advance of `count`
# and the write were missing from this copy of the file. The loop index is
# used as the counter; start == now for open proposals and an empty user
# name for "about nobody" are assumptions.
for count in range(OPEN_PROPOSALS + CLOSE_PROPOSALS):
    now = int(time.time())
    if count < CLOSE_PROPOSALS:
        # closed proposal: started 7-21 days ago, ended within the last 7 days
        start = now - random.randrange(3600*24*7, 3600*24*21)
        end = now - random.randrange(0, 3600*24*7)
    else:
        # open proposal: ends between one and seven days from now
        start = now
        end = now + random.randrange(3600*24, 3600*24*7)
    if count % 5 == 0:  # Don't make the vote about anyone once in a while
        user = ""
    else:
        user = random.choice(user_keys)
    suid = random.choice(trustedusers)
    s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
         " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
    out.write(s % (genFortune(), user, start, end, suid))
# Close the transaction opened by the initial BEGIN, then close the file so
# everything buffered is flushed to disk (closing twice is harmless).
out.write("COMMIT;\n")
out.close()