3 usage: gendummydata.py outputfilename.sql
6 # This script seeds the AUR database with dummy data for
7 # use during development/testing. It uses random entries
8 # from /usr/share/dict/words to create user accounts and
9 # package names. It generates the SQL statements to
10 # insert these users/packages into the AUR database.
20 LOG_LEVEL
= logging
.DEBUG
# logging level. set to logging.INFO to reduce output
21 SEED_FILE
= "/usr/share/dict/words"
22 DB_HOST
= os
.getenv("DB_HOST", "localhost")
23 DB_NAME
= os
.getenv("DB_NAME", "AUR")
24 DB_USER
= os
.getenv("DB_USER", "aur")
25 DB_PASS
= os
.getenv("DB_PASS", "aur")
26 USER_ID
= 5 # Users.ID of first bogus user
27 PKG_ID
= 1 # Packages.ID of first package
28 MAX_USERS
= 300 # how many users to 'register'
29 MAX_DEVS
= .1 # what percentage of MAX_USERS are Developers
30 MAX_TUS
= .2 # what percentage of MAX_USERS are Trusted Users
31 MAX_PKGS
= 900 # how many packages to load
32 PKG_DEPS
= (1, 15) # min/max depends a package has
33 PKG_RELS
= (1, 5) # min/max relations a package has
34 PKG_SRC
= (1, 3) # min/max sources a package has
35 PKG_CMNTS
= (1, 5) # min/max number of comments a package has
36 CATEGORIES_COUNT
= 17 # the number of categories from aur-schema
37 VOTING
= (0, .30) # percentage range for package voting
38 OPEN_PROPOSALS
= 5 # number of open trusted user proposals
39 CLOSE_PROPOSALS
= 15 # number of closed trusted user proposals
40 RANDOM_TLDS
= ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
41 RANDOM_URL
= ("http://www.", "ftp://ftp.", "http://", "ftp://")
42 RANDOM_LOCS
= ("pub", "release", "files", "downloads", "src")
43 FORTUNE_FILE
= "/usr/share/fortune/cookie"
46 logformat
= "%(levelname)s: %(message)s"
47 logging
.basicConfig(format
=logformat
, level
=LOG_LEVEL
)
48 log
= logging
.getLogger()
50 if len(sys
.argv
) != 2:
51 log
.error("Missing output filename argument")
54 # make sure the seed file exists
56 if not os
.path
.exists(SEED_FILE
):
57 log
.error("Please install the 'words' Arch package")
60 # make sure comments can be created
62 if not os
.path
.exists(FORTUNE_FILE
):
63 log
.error("Please install the 'fortune-mod' Arch package")
66 # track what users/package names have been used
72 # some functions to generate random data
76 ver
.append("%d" % random
.randrange(0,10))
77 ver
.append("%d" % random
.randrange(0,20))
78 if random
.randrange(0,2) == 0:
79 ver
.append("%d" % random
.randrange(0,100))
80 return ".".join(ver
) + "-%d" % random
.randrange(1,11)
82 return random
.randrange(1,CATEGORIES_COUNT
)
84 return seen_users
[user_keys
[random
.randrange(0,len(user_keys
))]]
86 return fortunes
[random
.randrange(0,len(fortunes
))].replace("'", "")
89 # load the words, and make sure there are enough words for users/pkgs
91 log
.debug("Grabbing words from seed file...")
92 fp
= open(SEED_FILE
, "r", encoding
="utf-8")
93 contents
= fp
.readlines()
95 if MAX_USERS
> len(contents
):
96 MAX_USERS
= len(contents
)
97 if MAX_PKGS
> len(contents
):
98 MAX_PKGS
= len(contents
)
99 if len(contents
) - MAX_USERS
> MAX_PKGS
:
104 # select random usernames
106 log
.debug("Generating random user names...")
108 while len(seen_users
) < MAX_USERS
:
109 user
= random
.randrange(0, len(contents
))
110 word
= contents
[user
].replace("'", "").replace(".","").replace(" ", "_")
111 word
= word
.strip().lower()
112 if word
not in seen_users
:
113 seen_users
[word
] = user_id
115 user_keys
= list(seen_users
.keys())
117 # select random package names
119 log
.debug("Generating random package names...")
121 while len(seen_pkgs
) < MAX_PKGS
:
122 pkg
= random
.randrange(0, len(contents
))
123 word
= contents
[pkg
].replace("'", "").replace(".","").replace(" ", "_")
124 word
= word
.strip().lower()
126 if word
not in seen_pkgs
and word
not in seen_users
:
127 seen_pkgs
[word
] = num_pkgs
130 if word
not in seen_pkgs
:
131 seen_pkgs
[word
] = num_pkgs
134 # free up contents memory
145 # Just let python throw the errors if any happen
147 out
= open(sys
.argv
[1], "w", encoding
="utf-8")
148 out
.write("BEGIN;\n")
150 # Begin by creating the User statements
152 log
.debug("Creating SQL statements for users.")
154 account_type
= 1 # default to normal user
155 if not has_devs
or not has_tus
:
156 account_type
= random
.randrange(1, 4)
157 if account_type
== 3 and not has_devs
:
158 # this will be a dev account
160 developers
.append(seen_users
[u
])
161 if len(developers
) >= MAX_DEVS
* MAX_USERS
:
163 elif account_type
== 2 and not has_tus
:
164 # this will be a trusted user account
166 trustedusers
.append(seen_users
[u
])
167 if len(trustedusers
) >= MAX_TUS
* MAX_USERS
:
170 # a normal user account
174 h
= hashlib
.new('md5')
175 h
.update(u
.encode());
176 s
= ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
177 " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
178 s
= s
% (seen_users
[u
], account_type
, u
, u
, h
.hexdigest())
181 log
.debug("Number of developers: %d" % len(developers
))
182 log
.debug("Number of trusted users: %d" % len(trustedusers
))
183 log
.debug("Number of users: %d" % (MAX_USERS
-len(developers
)-len(trustedusers
)))
184 log
.debug("Number of packages: %d" % MAX_PKGS
)
186 log
.debug("Gathering text from fortune file...")
187 fp
= open(FORTUNE_FILE
, "r", encoding
="utf-8")
188 fortunes
= fp
.read().split("%\n")
191 # Create the package statements
193 log
.debug("Creating SQL statements for packages.")
195 for p
in list(seen_pkgs
.keys()):
196 NOW
= int(time
.time())
198 muid
= developers
[random
.randrange(0,len(developers
))]
199 puid
= developers
[random
.randrange(0,len(developers
))]
201 muid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
202 puid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
203 if count
% 20 == 0: # every so often, there are orphans...
206 uuid
= genUID() # the submitter/user
208 s
= ("INSERT INTO PackageBases (ID, Name, SubmittedTS, ModifiedTS, "
209 "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', %d, %d, %d, %s, %s);\n")
210 s
= s
% (seen_pkgs
[p
], p
, NOW
, NOW
, uuid
, muid
, puid
)
213 s
= ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
214 "(%d, %d, '%s', '%s');\n")
215 s
= s
% (seen_pkgs
[p
], seen_pkgs
[p
], p
, genVersion())
220 # create random comments for this package
222 num_comments
= random
.randrange(PKG_CMNTS
[0], PKG_CMNTS
[1])
223 for i
in range(0, num_comments
):
224 now
= NOW
+ random
.randrange(400, 86400*3)
225 s
= ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
226 " Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n")
227 s
= s
% (seen_pkgs
[p
], genUID(), genFortune(), now
)
233 log
.debug("Casting votes for packages.")
235 num_votes
= random
.randrange(int(len(seen_pkgs
)*VOTING
[0]),
236 int(len(seen_pkgs
)*VOTING
[1]))
238 for v
in range(num_votes
):
239 pkg
= random
.randrange(1, len(seen_pkgs
) + 1)
240 if pkg
not in pkgvote
:
241 s
= ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
242 " VALUES (%d, %d);\n")
243 s
= s
% (seen_users
[u
], pkg
)
245 if pkg
not in track_votes
:
247 track_votes
[pkg
] += 1
250 # Update statements for package votes
252 for p
in list(track_votes
.keys()):
253 s
= "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
254 s
= s
% (track_votes
[p
], p
)
257 # Create package dependencies and sources
259 log
.debug("Creating statements for package depends/sources.")
260 for p
in list(seen_pkgs
.keys()):
261 num_deps
= random
.randrange(PKG_DEPS
[0], PKG_DEPS
[1])
262 for i
in range(0, num_deps
):
263 dep
= random
.choice([k
for k
in seen_pkgs
])
264 deptype
= random
.randrange(1, 5)
266 dep
+= ": for " + random
.choice([k
for k
in seen_pkgs
])
267 s
= "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
268 s
= s
% (seen_pkgs
[p
], deptype
, dep
)
271 num_rels
= random
.randrange(PKG_RELS
[0], PKG_RELS
[1])
272 for i
in range(0, num_deps
):
273 rel
= random
.choice([k
for k
in seen_pkgs
])
274 reltype
= random
.randrange(1, 4)
275 s
= "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
276 s
= s
% (seen_pkgs
[p
], reltype
, rel
)
279 num_sources
= random
.randrange(PKG_SRC
[0], PKG_SRC
[1])
280 for i
in range(num_sources
):
281 src_file
= user_keys
[random
.randrange(0, len(user_keys
))]
282 src
= "%s%s.%s/%s/%s-%s.tar.gz" % (
283 RANDOM_URL
[random
.randrange(0,len(RANDOM_URL
))],
284 p
, RANDOM_TLDS
[random
.randrange(0,len(RANDOM_TLDS
))],
285 RANDOM_LOCS
[random
.randrange(0,len(RANDOM_LOCS
))],
286 src_file
, genVersion())
287 s
= "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
288 s
= s
% (seen_pkgs
[p
], src
)
291 # Create trusted user proposals
293 log
.debug("Creating SQL statements for trusted user proposals.")
295 for t
in range(0, OPEN_PROPOSALS
+CLOSE_PROPOSALS
):
296 now
= int(time
.time())
297 if count
< CLOSE_PROPOSALS
:
298 start
= now
- random
.randrange(3600*24*7, 3600*24*21)
299 end
= now
- random
.randrange(0, 3600*24*7)
302 end
= now
+ random
.randrange(3600*24, 3600*24*7)
303 if count
% 5 == 0: # Don't make the vote about anyone once in a while
306 user
= user_keys
[random
.randrange(0,len(user_keys
))]
307 suid
= trustedusers
[random
.randrange(0,len(trustedusers
))]
308 s
= ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
309 " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
310 s
= s
% (genFortune(), user
, start
, end
, suid
)
316 out
.write("COMMIT;\n")