David Benjamin | 1b5cfb5 | 2015-02-13 18:38:43 -0500 | [diff] [blame] | 1 | # Copyright (c) 2015, Google Inc. |
| 2 | # |
| 3 | # Permission to use, copy, modify, and/or distribute this software for any |
| 4 | # purpose with or without fee is hereby granted, provided that the above |
| 5 | # copyright notice and this permission notice appear in all copies. |
| 6 | # |
| 7 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 8 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 9 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| 10 | # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 11 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| 12 | # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| 13 | # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 14 | |
| 15 | """Extracts archives.""" |
| 16 | |
| 17 | |
David Benjamin | 0ec5639 | 2016-12-01 23:48:00 -0500 | [diff] [blame] | 18 | import hashlib |
David Benjamin | 0d5e080 | 2015-02-27 17:23:16 -0500 | [diff] [blame] | 19 | import optparse |
David Benjamin | 1b5cfb5 | 2015-02-13 18:38:43 -0500 | [diff] [blame] | 20 | import os |
| 21 | import os.path |
| 22 | import tarfile |
| 23 | import shutil |
| 24 | import sys |
| 25 | import zipfile |
| 26 | |
| 27 | |
def CheckedJoin(output, path):
  """
  CheckedJoin joins path onto output and returns the result.

  path is normalized first. A normalized path that is absolute or that begins
  with '.' is rejected with ValueError (carrying the normalized path). This is
  only a sanity check that the result stays under output; it is not meant to
  defend against untrusted input.
  """
  normalized = os.path.normpath(path)
  # After normalization any '..' components can only appear at the front, so a
  # leading '.' covers '.', '..' and '../...' (and also dot-prefixed names).
  looks_unsafe = os.path.isabs(normalized) or normalized.startswith('.')
  if looks_unsafe:
    raise ValueError(normalized)
  return os.path.join(output, normalized)
David Benjamin | 1b5cfb5 | 2015-02-13 18:38:43 -0500 | [diff] [blame] | 38 | |
| 39 | |
class FileEntry(object):
  """A regular file found in an archive.

  path is the entry's name inside the archive, mode is its permission bits (or
  None when the archive reader does not supply them), and fileobj is a
  file-like object from which the entry's contents can be read.
  """

  def __init__(self, path, mode, fileobj):
    self.path, self.mode, self.fileobj = path, mode, fileobj
| 45 | |
| 46 | |
class SymlinkEntry(object):
  """A symbolic link found in an archive.

  path is the link's name inside the archive, mode is its permission bits (or
  None when not supplied), and target is the path the link points at.
  """

  def __init__(self, path, mode, target):
    self.path, self.mode, self.target = path, mode, target
| 52 | |
| 53 | |
def IterateZip(path):
  """
  IterateZip is a generator over the files stored in the zip archive at path.

  Each non-directory member is yielded as a FileEntry whose mode is None and
  whose fileobj is the member opened for reading. Members whose names end in
  '/' are directories and are skipped. The archive stays open while the
  generator is being consumed.
  """
  with zipfile.ZipFile(path, 'r') as archive:
    for member in archive.infolist():
      if not member.filename.endswith('/'):
        yield FileEntry(member.filename, None, archive.open(member))
David Benjamin | 1b5cfb5 | 2015-02-13 18:38:43 -0500 | [diff] [blame] | 64 | |
| 65 | |
def IterateTar(path, compression):
  """
  IterateTar is a generator over the entries of the compressed tar archive at
  path. compression is the tarfile mode suffix naming the compression (for
  example 'gz' or 'bz2').

  Directories are skipped. Symlinks are yielded as SymlinkEntry objects with
  no mode, and regular files as FileEntry objects carrying the member's mode
  and a readable file object. Any other member type raises ValueError.
  """
  with tarfile.open(path, 'r:' + compression) as archive:
    for member in archive:
      if member.isdir():
        continue
      if member.issym():
        yield SymlinkEntry(member.name, None, member.linkname)
        continue
      if not member.isfile():
        raise ValueError('Unknown entry type "%s"' % (member.name, ))
      yield FileEntry(member.name, member.mode, archive.extractfile(member))
David Benjamin | 1b5cfb5 | 2015-02-13 18:38:43 -0500 | [diff] [blame] | 81 | |
| 82 | |
def _Sha256OfFile(path):
  """Returns the hex SHA-256 digest of the file at path."""
  sha256 = hashlib.sha256()
  with open(path, 'rb') as f:
    # Read in 1 MiB chunks so large archives are never held in memory at once.
    while True:
      chunk = f.read(1024 * 1024)
      if not chunk:
        break
      sha256.update(chunk)
  return sha256.hexdigest()


def _OpenArchive(archive):
  """Returns the entry generator for archive, chosen by its file extension.

  Raises ValueError(archive) if the extension is not a supported one.
  """
  if archive.endswith('.zip'):
    return IterateZip(archive)
  for compression in ('gz', 'bz2', 'xz'):
    if archive.endswith('.tar.' + compression):
      return IterateTar(archive, compression)
  raise ValueError(archive)


def main(args):
  """Extracts ARCHIVE into OUTPUT unless OUTPUT is already up to date.

  args is the command line without the program name: optional flags followed
  by the archive path and the output directory. Unless --no-prefix is passed,
  every entry must live under one common top-level directory, which is
  stripped from the extracted paths.

  After a successful extraction the archive's SHA-256 digest is written to a
  stamp file inside OUTPUT; a later run with a matching stamp does nothing.

  Returns 1 (after printing usage) if the positional arguments are wrong and 0
  otherwise, including when the archive does not exist or is already
  extracted.

  Raises ValueError for an unsupported archive extension, for entry paths that
  contain a backslash or start with '/', for entries that are not under the
  common top-level directory, and for paths rejected by CheckedJoin. Raises
  TypeError if an entry is neither a FileEntry nor a SymlinkEntry.
  """
  parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
  parser.add_option('--no-prefix', dest='no_prefix', action='store_true',
                    help='Do not remove a prefix from paths in the archive.')
  options, args = parser.parse_args(args)

  if len(args) != 2:
    parser.print_help()
    return 1

  archive, output = args

  if not os.path.exists(archive):
    # Skip archives that weren't downloaded.
    return 0

  digest = _Sha256OfFile(archive)

  # The stamp records which archive the output directory was extracted from.
  stamp_path = os.path.join(output, ".boringssl_archive_digest")
  if os.path.exists(stamp_path):
    with open(stamp_path) as f:
      if f.read().strip() == digest:
        print("Already up-to-date.")
        return 0

  entries = _OpenArchive(archive)

  num_extracted = 0
  try:
    if os.path.exists(output):
      print("Removing %s" % (output, ))
      shutil.rmtree(output)

    print("Extracting %s to %s" % (archive, output))
    prefix = None
    for entry in entries:
      # Even on Windows, zip files must always use forward slashes.
      if '\\' in entry.path or entry.path.startswith('/'):
        raise ValueError(entry.path)

      if not options.no_prefix:
        new_prefix, separator, rest = entry.path.partition('/')
        if not separator:
          # A top-level entry has no directory prefix to strip. Report it by
          # name instead of failing on the tuple unpacking.
          raise ValueError(
              'Entry "%s" has no top-level directory prefix' % (entry.path, ))

        # Ensure the archive is consistent.
        if prefix is None:
          prefix = new_prefix
        if prefix != new_prefix:
          raise ValueError((prefix, new_prefix))
      else:
        rest = entry.path

      # Extract the file into the output directory.
      fixed_path = CheckedJoin(output, rest)
      parent = os.path.dirname(fixed_path)
      if not os.path.isdir(parent):
        os.makedirs(parent)
      if isinstance(entry, FileEntry):
        try:
          with open(fixed_path, 'wb') as out:
            shutil.copyfileobj(entry.fileobj, out)
        finally:
          # Release the per-member handle as soon as it has been copied rather
          # than leaving it to the garbage collector.
          entry.fileobj.close()
      elif isinstance(entry, SymlinkEntry):
        os.symlink(entry.target, fixed_path)
      else:
        raise TypeError('unknown entry type')

      # Fix up permissions if need be.
      # TODO(davidben): To be extra tidy, this should only track the execute bit
      # as in git.
      if entry.mode is not None:
        os.chmod(fixed_path, entry.mode)

      # Print every 100 files, so bots do not time out on large archives.
      num_extracted += 1
      if num_extracted % 100 == 0:
        print("Extracted %d files..." % (num_extracted,))
  finally:
    # Closing the generator also closes the underlying archive.
    entries.close()

  # An archive that yielded no entries never created the output directory, so
  # make sure it exists before writing the stamp into it.
  if not os.path.isdir(output):
    os.makedirs(output)
  with open(stamp_path, 'w') as f:
    f.write(digest)

  print("Done. Extracted %d files." % (num_extracted,))
  return 0
| 180 | |
| 181 | |
if __name__ == '__main__':
  # Run as a script: hand main() the arguments after the program name and use
  # its return value as the process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)