3 usage: gendummydata.py outputfilename.sql
6 # This script seeds the AUR database with dummy data for
7 # use during development/testing. It uses random entries
8 # from /usr/share/dict/words to create user accounts and
9 # package names. It generates the SQL statements to
10 # insert these users/packages into the AUR database.
19 LOG_LEVEL
= logging
.DEBUG
# logging level. set to logging.INFO to reduce output
20 SEED_FILE
= "/usr/share/dict/words"
21 DB_HOST
= os
.getenv("DB_HOST", "localhost")
22 DB_NAME
= os
.getenv("DB_NAME", "AUR")
23 DB_USER
= os
.getenv("DB_USER", "aur")
24 DB_PASS
= os
.getenv("DB_PASS", "aur")
25 USER_ID
= 5 # Users.ID of first bogus user
26 PKG_ID
= 1 # Packages.ID of first package
27 MAX_USERS
= 300 # how many users to 'register'
28 MAX_DEVS
= .1 # what percentage of MAX_USERS are Developers
29 MAX_TUS
= .2 # what percentage of MAX_USERS are Trusted Users
30 MAX_PKGS
= 900 # how many packages to load
31 PKG_DEPS
= (1, 15) # min/max depends a package has
32 PKG_RELS
= (1, 5) # min/max relations a package has
33 PKG_SRC
= (1, 3) # min/max sources a package has
34 PKG_CMNTS
= (1, 5) # min/max number of comments a package has
35 CATEGORIES_COUNT
= 17 # the number of categories from aur-schema
36 VOTING
= (0, .30) # percentage range for package voting
37 OPEN_PROPOSALS
= 5 # number of open trusted user proposals
38 CLOSE_PROPOSALS
= 15 # number of closed trusted user proposals
39 RANDOM_TLDS
= ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
40 RANDOM_URL
= ("http://www.", "ftp://ftp.", "http://", "ftp://")
41 RANDOM_LOCS
= ("pub", "release", "files", "downloads", "src")
42 FORTUNE_FILE
= "/usr/share/fortune/cookie"
45 logformat
= "%(levelname)s: %(message)s"
46 logging
.basicConfig(format
=logformat
, level
=LOG_LEVEL
)
47 log
= logging
.getLogger()
49 if len(sys
.argv
) != 2:
50 log
.error("Missing output filename argument")
53 # make sure the seed file exists
55 if not os
.path
.exists(SEED_FILE
):
56 log
.error("Please install the 'words' Arch package")
59 # make sure comments can be created
61 if not os
.path
.exists(FORTUNE_FILE
):
62 log
.error("Please install the 'fortune-mod' Arch package")
65 # track what users/package names have been used
71 # some functions to generate random data
75 ver
.append("%d" % random
.randrange(0,10))
76 ver
.append("%d" % random
.randrange(0,20))
77 if random
.randrange(0,2) == 0:
78 ver
.append("%d" % random
.randrange(0,100))
79 return ".".join(ver
) + "-%d" % random
.randrange(1,11)
81 return random
.randrange(1,CATEGORIES_COUNT
)
83 return seen_users
[user_keys
[random
.randrange(0,len(user_keys
))]]
85 return fortunes
[random
.randrange(0,len(fortunes
))].replace("'", "")
88 # load the words, and make sure there are enough words for users/pkgs
90 log
.debug("Grabbing words from seed file...")
91 fp
= open(SEED_FILE
, "r", encoding
="utf-8")
92 contents
= fp
.readlines()
94 if MAX_USERS
> len(contents
):
95 MAX_USERS
= len(contents
)
96 if MAX_PKGS
> len(contents
):
97 MAX_PKGS
= len(contents
)
98 if len(contents
) - MAX_USERS
> MAX_PKGS
:
103 # select random usernames
105 log
.debug("Generating random user names...")
107 while len(seen_users
) < MAX_USERS
:
108 user
= random
.randrange(0, len(contents
))
109 word
= contents
[user
].replace("'", "").replace(".","").replace(" ", "_")
110 word
= word
.strip().lower()
111 if word
not in seen_users
:
112 seen_users
[word
] = user_id
114 user_keys
= list(seen_users
.keys())
116 # select random package names
118 log
.debug("Generating random package names...")
120 while len(seen_pkgs
) < MAX_PKGS
:
121 pkg
= random
.randrange(0, len(contents
))
122 word
= contents
[pkg
].replace("'", "").replace(".","").replace(" ", "_")
123 word
= word
.strip().lower()
125 if word
not in seen_pkgs
and word
not in seen_users
:
126 seen_pkgs
[word
] = num_pkgs
129 if word
not in seen_pkgs
:
130 seen_pkgs
[word
] = num_pkgs
133 # free up contents memory
144 # Just let python throw the errors if any happen
146 out
= open(sys
.argv
[1], "w", encoding
="utf-8")
147 out
.write("BEGIN;\n")
149 # Begin by creating the User statements
151 log
.debug("Creating SQL statements for users.")
153 account_type
= 1 # default to normal user
154 if not has_devs
or not has_tus
:
155 account_type
= random
.randrange(1, 4)
156 if account_type
== 3 and not has_devs
:
157 # this will be a dev account
159 developers
.append(seen_users
[u
])
160 if len(developers
) >= MAX_DEVS
* MAX_USERS
:
162 elif account_type
== 2 and not has_tus
:
163 # this will be a trusted user account
165 trustedusers
.append(seen_users
[u
])
166 if len(trustedusers
) >= MAX_TUS
* MAX_USERS
:
169 # a normal user account
173 s
= ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
174 " VALUES (%d, %d, '%s', '%s@example.com', MD5('%s'));\n")
175 s
= s
% (seen_users
[u
], account_type
, u
, u
, u
)
178 log
.debug("Number of developers: %d" % len(developers
))
179 log
.debug("Number of trusted users: %d" % len(trustedusers
))
180 log
.debug("Number of users: %d" % (MAX_USERS
-len(developers
)-len(trustedusers
)))
181 log
.debug("Number of packages: %d" % MAX_PKGS
)
183 log
.debug("Gathering text from fortune file...")
184 fp
= open(FORTUNE_FILE
, "r", encoding
="utf-8")
185 fortunes
= fp
.read().split("%\n")
188 # Create the package statements
190 log
.debug("Creating SQL statements for packages.")
192 for p
in list(seen_pkgs
.keys()):
193 NOW
= int(time
.time())
195 muid
= developers
[random
.randrange(0,len(developers
))]
196 puid
= developers
[random
.randrange(0,len(developers
))]
198 muid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
199 puid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
200 if count
% 20 == 0: # every so often, there are orphans...
203 uuid
= genUID() # the submitter/user
205 s
= ("INSERT INTO PackageBases (ID, Name, SubmittedTS, "
206 "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', %d, %d, %s, %s);\n")
207 s
= s
% (seen_pkgs
[p
], p
, NOW
, uuid
, muid
, puid
)
210 s
= ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
211 "(%d, %d, '%s', '%s');\n")
212 s
= s
% (seen_pkgs
[p
], seen_pkgs
[p
], p
, genVersion())
217 # create random comments for this package
219 num_comments
= random
.randrange(PKG_CMNTS
[0], PKG_CMNTS
[1])
220 for i
in range(0, num_comments
):
221 now
= NOW
+ random
.randrange(400, 86400*3)
222 s
= ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
223 " Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n")
224 s
= s
% (seen_pkgs
[p
], genUID(), genFortune(), now
)
230 log
.debug("Casting votes for packages.")
232 num_votes
= random
.randrange(int(len(seen_pkgs
)*VOTING
[0]),
233 int(len(seen_pkgs
)*VOTING
[1]))
235 for v
in range(num_votes
):
236 pkg
= random
.randrange(1, len(seen_pkgs
) + 1)
237 if pkg
not in pkgvote
:
238 s
= ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
239 " VALUES (%d, %d);\n")
240 s
= s
% (seen_users
[u
], pkg
)
242 if pkg
not in track_votes
:
244 track_votes
[pkg
] += 1
247 # Update statements for package votes
249 for p
in list(track_votes
.keys()):
250 s
= "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
251 s
= s
% (track_votes
[p
], p
)
254 # Create package dependencies and sources
256 log
.debug("Creating statements for package depends/sources.")
257 for p
in list(seen_pkgs
.keys()):
258 num_deps
= random
.randrange(PKG_DEPS
[0], PKG_DEPS
[1])
259 for i
in range(0, num_deps
):
260 dep
= random
.choice([k
for k
in seen_pkgs
])
261 deptype
= random
.randrange(1, 5)
263 dep
+= ": for " + random
.choice([k
for k
in seen_pkgs
])
264 s
= "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
265 s
= s
% (seen_pkgs
[p
], deptype
, dep
)
268 num_rels
= random
.randrange(PKG_RELS
[0], PKG_RELS
[1])
269 for i
in range(0, num_deps
):
270 rel
= random
.choice([k
for k
in seen_pkgs
])
271 reltype
= random
.randrange(1, 4)
272 s
= "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
273 s
= s
% (seen_pkgs
[p
], reltype
, rel
)
276 num_sources
= random
.randrange(PKG_SRC
[0], PKG_SRC
[1])
277 for i
in range(num_sources
):
278 src_file
= user_keys
[random
.randrange(0, len(user_keys
))]
279 src
= "%s%s.%s/%s/%s-%s.tar.gz" % (
280 RANDOM_URL
[random
.randrange(0,len(RANDOM_URL
))],
281 p
, RANDOM_TLDS
[random
.randrange(0,len(RANDOM_TLDS
))],
282 RANDOM_LOCS
[random
.randrange(0,len(RANDOM_LOCS
))],
283 src_file
, genVersion())
284 s
= "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
285 s
= s
% (seen_pkgs
[p
], src
)
288 # Create trusted user proposals
290 log
.debug("Creating SQL statements for trusted user proposals.")
292 for t
in range(0, OPEN_PROPOSALS
+CLOSE_PROPOSALS
):
293 now
= int(time
.time())
294 if count
< CLOSE_PROPOSALS
:
295 start
= now
- random
.randrange(3600*24*7, 3600*24*21)
296 end
= now
- random
.randrange(0, 3600*24*7)
299 end
= now
+ random
.randrange(3600*24, 3600*24*7)
300 if count
% 5 == 0: # Don't make the vote about anyone once in a while
303 user
= user_keys
[random
.randrange(0,len(user_keys
))]
304 suid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
305 s
= ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
306 " SubmitterID) VALUES ('%s', '%s', %d, %d, %d);\n")
307 s
= s
% (genFortune(), user
, start
, end
, suid
)
313 out
.write("COMMIT;\n")