1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import re
import struct
import subprocess
import sys
# ZIP local file header layout, in on-disk order (see PKWARE APPNOTE 4.3.7).
# Each entry is (field_name, type): "uint16"/"uint32" are fixed-width
# little-endian integers; any other value names the field that holds this
# variable-length field's byte length.
local_file_header = [
    ("signature", "uint32"),
    ("min_version", "uint16"),
    ("general_flag", "uint16"),
    ("compression", "uint16"),
    ("lastmod_time", "uint16"),
    ("lastmod_date", "uint16"),
    ("crc32", "uint32"),
    ("compressed_size", "uint32"),
    ("uncompressed_size", "uint32"),
    ("filename_size", "uint16"),
    ("extra_field_size", "uint16"),
    ("filename", "filename_size"),
    ("extra_field", "extra_field_size"),
    ("data", "compressed_size")
]
# ZIP central directory entry layout (PKWARE APPNOTE 4.3.12); same
# (field_name, type) convention as local_file_header.  "offset" is the
# position of the entry's local file header within the archive.
cdir_entry = [
    ("signature", "uint32"),
    ("creator_version", "uint16"),
    ("min_version", "uint16"),
    ("general_flag", "uint16"),
    ("compression", "uint16"),
    ("lastmod_time", "uint16"),
    ("lastmod_date", "uint16"),
    ("crc32", "uint32"),
    ("compressed_size", "uint32"),
    ("uncompressed_size", "uint32"),
    ("filename_size", "uint16"),
    ("extrafield_size", "uint16"),
    ("filecomment_size", "uint16"),
    ("disknum", "uint16"),
    ("internal_attr", "uint16"),
    ("external_attr", "uint32"),
    ("offset", "uint32"),
    ("filename", "filename_size"),
    ("extrafield", "extrafield_size"),
    ("filecomment", "filecomment_size"),
]
# ZIP end-of-central-directory record layout (PKWARE APPNOTE 4.3.16).
cdir_end = [
    ("signature", "uint32"),
    ("disk_num", "uint16"),
    ("cdir_disk", "uint16"),
    ("disk_entries", "uint16"),
    ("cdir_entries", "uint16"),
    ("cdir_size", "uint32"),
    ("cdir_offset", "uint32"),
    ("comment_size", "uint16"),
]
# struct-module format codes for the fixed-width types used in the layouts.
type_mapping = { "uint32":"I", "uint16":"H"}

def format_struct (format):
    """Build a struct format string for the fixed-width part of *format*.

    *format* is a list of (name, type) pairs as in the layouts above.
    Returns (fmt, string_fields) where fmt is a little-endian struct format
    covering every uint16/uint32 field in order, and string_fields maps
    each variable-length field's name to the name of the field that holds
    its byte length.
    """
    string_fields = {}
    fmt = "<"  # ZIP integers are little-endian
    for (name, value) in iter(format):
        if value in type_mapping:
            fmt += type_mapping[value][0]
        else:
            # not a primitive: value names the field holding this
            # field's length, so it is read separately
            string_fields[name] = value
    return (fmt, string_fields)
def size_of(format):
    """Return the byte size of the fixed-width portion of *format*
    (variable-length string fields are excluded by format_struct)."""
    return struct.calcsize(format_struct(format)[0])
class MyStruct:
    """Mutable record parsed from one of the layout descriptions above.

    Field values live in self.__dict__["struct_members"]; __getattr__ and
    __setattr__ proxy attribute access to that dict, and __setattr__
    refuses to create members that were never parsed.  string_fields maps
    variable-length member names to the members holding their sizes.

    NOTE(review): binary payloads are handled as Python 2 str throughout
    (pack() concatenates struct.pack output with ""), so pack() is not
    Python 3 safe — confirm before porting.
    """

    def __init__(self, format, string_fields):
        # assign via __dict__ directly to bypass our own __setattr__
        self.__dict__["struct_members"] = {}
        self.__dict__["format"] = format
        self.__dict__["string_fields"] = string_fields

    def addMember(self, name, value):
        """Register a parsed field value (the only way to create members)."""
        self.__dict__["struct_members"][name] = value

    def __getattr__(self, item):
        try:
            return self.__dict__["struct_members"][item]
        except KeyError:
            pass
        # dump state to aid debugging before failing the lookup
        print("no %s" % item)
        print(self.__dict__["struct_members"])
        raise AttributeError(item)

    def __setattr__(self, item, value):
        if item in self.__dict__["struct_members"]:
            self.__dict__["struct_members"][item] = value
        else:
            raise AttributeError(item)

    def pack(self):
        """Serialize members back to their on-disk form: fixed-width fields
        via struct.pack, then the variable-length fields appended in
        layout order."""
        extra_data = ""
        values = []
        string_fields = self.__dict__["string_fields"]
        struct_members = self.__dict__["struct_members"]
        format = self.__dict__["format"]
        for (name, _) in format:
            if name in string_fields:
                extra_data = extra_data + struct_members[name]
            else:
                values.append(struct_members[name])
        return struct.pack(format_struct(format)[0], *values) + extra_data
# end-of-central-directory signature ("PK\x05\x06" read little-endian)
ENDSIG = 0x06054b50

def assert_true(cond, msg):
    """Raise an Exception carrying *msg* when *cond* is false."""
    if not cond:
        raise Exception(msg)
class BinaryBlob:
    """Random-access reader over the raw bytes of a file.

    self.offset tracks the position just past the most recent read so that
    consecutive read_struct/readAt calls walk the file sequentially.
    """

    def __init__(self, f):
        # read the whole file up front; archives handled here are small
        with open(f, "rb") as fd:
            self.data = fd.read()
        self.offset = 0
        self.length = len(self.data)

    def readAt(self, pos, length):
        """Return *length* bytes starting at *pos* and advance self.offset
        to just past them."""
        self.offset = pos + length
        return self.data[pos:self.offset]

    def read_struct (self, format, offset = None):
        """Parse one *format* record at *offset* (default: current offset).

        Fixed-width fields are unpacked in one struct.unpack call; each
        variable-length field is then read using the size stored in the
        already-parsed field named by its descriptor.  The result is
        round-tripped through MyStruct.pack() as a serialization sanity
        check before returning.
        """
        if offset is None:
            offset = self.offset
        (fstr, string_fields) = format_struct(format)
        size = struct.calcsize(fstr)
        data = self.readAt(offset, size)
        ret = struct.unpack(fstr, data)
        retstruct = MyStruct(format, string_fields)
        i = 0
        for (name, _) in iter(format):
            if not name in string_fields:
                member_data = ret[i]
                i = i + 1
            else:
                # zip has data fields which are described by other struct
                # fields, this does additional reads to fill em in
                member_desc = string_fields[name]
                member_data = self.readAt(self.offset,
                                          retstruct.__getattr__(member_desc))
            retstruct.addMember(name, member_data)
        # sanity check serialization code
        data = self.readAt(offset, self.offset - offset)
        out_data = retstruct.pack()
        assert_true(out_data == data,
                    "Serialization fail %d !=%d" % (len(out_data), len(data)))
        return retstruct
def optimizejar(jar, outjar, inlog = None):
    """Rewrite *jar* into *outjar*, stripping directory entries and extra
    fields and normalizing timestamps.

    With *inlog* (a file listing one member filename per line) the named
    entries are moved to the front of the archive, the central directory
    is placed at offset 4, and the first 4 bytes record how many bytes of
    startup readahead to suggest.  With inlog=None the jar is
    "deoptimized" back to a standard layout and the list of filenames
    covered by the stored readahead is returned so the caller can save it
    as a new log.
    """
    if inlog is not None:
        with open(inlog) as logfd:
            inlog = logfd.read().rstrip()
        # in the case of an empty log still move the index forward
        if len(inlog) == 0:
            inlog = []
        else:
            inlog = inlog.split("\n")
    outlog = []
    jarblob = BinaryBlob(jar)
    dirend = jarblob.read_struct(cdir_end, jarblob.length - size_of(cdir_end))
    assert_true(dirend.signature == ENDSIG, "no signature in the end")
    cdir_offset = dirend.cdir_offset
    readahead = 0
    if inlog is None and cdir_offset == 4:
        # optimized jars stash the readahead byte count in the first 4 bytes
        readahead = struct.unpack("<I", jarblob.readAt(0, 4))[0]
        print("%s: startup data ends at byte %d" % (outjar, readahead))

    total_stripped = 0
    jarblob.offset = cdir_offset
    central_directory = []
    for i in range(0, dirend.cdir_entries):
        entry = jarblob.read_struct(cdir_entry)
        if entry.filename[-1:] == "/":
            # directory entries get dropped entirely later
            total_stripped += len(entry.pack())
        else:
            total_stripped += entry.extrafield_size
        central_directory.append(entry)

    reordered_count = 0
    if inlog is not None:
        dup_guard = set()
        for ordered_name in inlog:
            if ordered_name in dup_guard:
                continue
            dup_guard.add(ordered_name)
            found = False
            # entries before reordered_count are already placed; search the rest
            for i in range(reordered_count, len(central_directory)):
                if central_directory[i].filename == ordered_name:
                    # swap the cdir entries
                    tmp = central_directory[i]
                    central_directory[i] = central_directory[reordered_count]
                    central_directory[reordered_count] = tmp
                    reordered_count = reordered_count + 1
                    found = True
                    break
            if not found:
                print("Can't find '%s' in %s" % (ordered_name, jar))

    outfd = open(outjar, "wb")
    out_offset = 0
    if inlog is not None:
        # have to put central directory at offset 4 cos 0 confuses some tools.
        # This also lets us specify how many entries should be preread
        dirend.cdir_offset = 4
        # make room for central dir + end of dir + 4 extra bytes at front
        out_offset = (dirend.cdir_offset + dirend.cdir_size
                      + size_of(cdir_end) - total_stripped)
        outfd.seek(out_offset)

    cdir_data = ""
    written_count = 0
    crc_mapping = {}
    dups_found = 0
    dupe_bytes = 0
    # store number of bytes suggested for readahead
    for entry in central_directory:
        # read in the header twice..first for comparison, second time for
        # convenience when writing out
        jarfile = jarblob.read_struct(local_file_header, entry.offset)
        assert_true(jarfile.filename == entry.filename,
                    "Directory/Localheader mismatch")
        # drop directory entries
        if entry.filename[-1:] == "/":
            total_stripped += len(jarfile.pack())
            dirend.cdir_entries -= 1
            continue
        # drop extra field data
        total_stripped += jarfile.extra_field_size
        entry.extrafield = jarfile.extra_field = ""
        entry.extrafield_size = jarfile.extra_field_size = 0
        # normalize timestamps (DOS date for 2010-01-01, time 00:00)
        entry.lastmod_date = jarfile.lastmod_date = ((2010 - 1980) << 9) | (1 << 5) | 1
        entry.lastmod_time = jarfile.lastmod_time = 0
        data = jarfile.pack()
        outfd.write(data)
        old_entry_offset = entry.offset
        entry.offset = out_offset
        out_offset = out_offset + len(data)
        entry_data = entry.pack()
        cdir_data += entry_data
        expected_len = (entry.filename_size + entry.extrafield_size
                        + entry.filecomment_size)
        # NOTE(review): entry_data also contains the fixed-width header, so
        # this inequality is a minimal sanity check only; kept as-is.
        assert_true(len(entry_data) != expected_len,
                    "%s entry size - expected:%d got:%d" % (entry.filename, len(entry_data), expected_len))
        written_count += 1

        if entry.crc32 in crc_mapping:
            dups_found += 1
            dupe_bytes += entry.compressed_size + len(data) + len(entry_data)
            print("%s\n\tis a duplicate of\n%s\n---" % (entry.filename, crc_mapping[entry.crc32]))
        else:
            crc_mapping[entry.crc32] = entry.filename

        if inlog is not None:
            if written_count == reordered_count:
                readahead = out_offset
                print("%s: startup data ends at byte %d" % (outjar, readahead))
            elif written_count < reordered_count:
                pass
                #print("%s @ %d" % (entry.filename, out_offset))
        elif readahead >= old_entry_offset + len(data):
            # entry sat inside the old readahead region: record it for the log
            outlog.append(entry.filename)
            reordered_count += 1

    if inlog is None:
        dirend.cdir_offset = out_offset

    if dups_found > 0:
        print("WARNING: Found %d duplicate files taking %d bytes" % (dups_found, dupe_bytes))

    dirend.cdir_size = len(cdir_data)
    dirend.disk_entries = dirend.cdir_entries
    dirend_data = dirend.pack()
    assert_true(size_of(cdir_end) == len(dirend_data),
                "Failed to serialize directory end correctly. Serialized size;%d, expected:%d" % (len(dirend_data), size_of(cdir_end)))

    outfd.seek(dirend.cdir_offset)
    outfd.write(cdir_data)
    outfd.write(dirend_data)

    # for ordered jars the central directory is written in the begining of
    # the file, so a second central-directory entry has to be written in
    # the end of the file
    if inlog is not None:
        outfd.seek(0)
        outfd.write(struct.pack("<I", readahead))
        outfd.seek(out_offset)
        outfd.write(dirend_data)

    outfd.close()
    print("Stripped %d bytes" % total_stripped)
    print("%s %d/%d in %s" % (("Ordered" if inlog is not None else "Deoptimized"),
                              reordered_count, len(central_directory), outjar))
    return outlog
# Fail fast on a bad command line.
# NOTE(review): this runs at import time, not only under __main__ — kept
# at module level to preserve the existing behavior.
if len(sys.argv) != 5:
    print("Usage: --optimize|--deoptimize %s JAR_LOG_DIR IN_JAR_DIR OUT_JAR_DIR" % sys.argv[0])
    sys.exit(1)
# filenames ending in ".jar" (the trailing "r" is optional, so ".ja" too)
jar_regex = re.compile(r"\.jar?$")
def optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
    """Optimize every .jar in IN_JAR_DIR into OUT_JAR_DIR, ordering each
    by its JAR_LOG_DIR/<name>.log file when one exists."""
    ls = os.listdir(IN_JAR_DIR)
    for jarfile in ls:
        if not re.search(jar_regex, jarfile):
            continue
        injarfile = os.path.join(IN_JAR_DIR, jarfile)
        outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
        logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
        # no log for this jar: still rewrite it, just without reordering
        if not os.path.isfile(logfile):
            logfile = None
        optimizejar(injarfile, outjarfile, logfile)
def deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
    """Deoptimize every .jar in IN_JAR_DIR into OUT_JAR_DIR, writing the
    ordering recovered from each into JAR_LOG_DIR/<name>.log."""
    if not os.path.exists(JAR_LOG_DIR):
        os.makedirs(JAR_LOG_DIR)
    ls = os.listdir(IN_JAR_DIR)
    for jarfile in ls:
        if not re.search(jar_regex, jarfile):
            continue
        injarfile = os.path.join(IN_JAR_DIR, jarfile)
        outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
        logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
        log = optimizejar(injarfile, outjarfile, None)
        # close the log explicitly instead of leaking the handle;
        # "wb" kept as-is (this script handles the data as py2 str)
        with open(logfile, "wb") as logfd:
            logfd.write("\n".join(log))
def main():
    """Dispatch --optimize/--deoptimize over the three directory args
    (argv layout validated by the module-level usage check)."""
    MODE = sys.argv[1]
    JAR_LOG_DIR = sys.argv[2]
    IN_JAR_DIR = sys.argv[3]
    OUT_JAR_DIR = sys.argv[4]
    if MODE == "--optimize":
        optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
    elif MODE == "--deoptimize":
        deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
    else:
        print("Unknown mode %s" % MODE)
        sys.exit(1)
if __name__ == '__main__':
    main()