about summary refs log tree commit diff
path: root/pkgs/development/tools/build-managers/gradle/compress-deps-json.py
blob: 6397892835adf80de5c761f61743aa677c437b2c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import json
import sys

from typing import Dict, Set

# this compresses MITM URL lists with Gradle-specific optimizations
# specifically, it splits each url into up to 3 parts - they will be
# concatenated like part1/part2.part3 or part1.part2
# part3 is simply always the file extension, but part1 and part2 is
# optimized using special heuristics
# additionally, if part2 ends with /a/b/{a}-{b}, the all occurences of
# /{a}/{b}/ are replaced with #
# finally, anything that ends with = is considered SHA256, anything that
# starts with http is considered a redirect URL, anything else is
# considered text

with open(sys.argv[1], "rt") as f:
    data: dict = json.load(f)

new_data: Dict[str, Dict[str, Dict[str, dict]]] = {}

for url, info in data.items():
    if url == "!version":
        continue
    ext, base = map(lambda x: x[::-1], url[::-1].split(".", 1))
    if base.endswith(".tar"):
        base = base[:-4]
        ext = "tar." + ext
    # special logic for Maven repos
    if ext in ["jar", "pom", "module"]:
        comps = base.split("/")
        if "-" in comps[-1]:
            # convert base/name/ver/name-ver into base#name/ver

            filename = comps[-1]
            name = comps[-3]
            basever = comps[-2]
            ver = basever
            is_snapshot = ver.endswith("-SNAPSHOT")
            if is_snapshot:
                ver = ver.removesuffix("-SNAPSHOT")
            if filename.startswith(f"{name}-{ver}"):
                if is_snapshot:
                    if filename.startswith(f"{name}-{ver}-SNAPSHOT"):
                        ver += "-SNAPSHOT"
                    else:
                        ver += "-".join(
                            filename.removeprefix(f"{name}-{ver}").split("-")[:3]
                        )
                comp_end = comps[-1].removeprefix(f"{name}-{ver}")
            else:
                ver, name, comp_end = None, None, None
            if name and ver and (not comp_end or comp_end.startswith("-")):
                base = "/".join(comps[:-1]) + "/"
                base = base.replace(f"/{name}/{basever}/", "#")
                base += f"{name}/{ver}"
                if is_snapshot:
                    base += "/SNAPSHOT"
                if comp_end:
                    base += "/" + comp_end[1:]
    scheme, rest = base.split("/", 1)
    if scheme not in new_data.keys():
        new_data[scheme] = {}
    if rest not in new_data[scheme].keys():
        new_data[scheme][rest] = {}
    if "hash" in info.keys():
        new_data[scheme][rest][ext] = info["hash"]
    elif "text" in info.keys() and ext == "xml":
        # nix code in fetch-deps.nix will autogenerate metadata xml files groupId
        # is part of the URL, but it can be tricky to parse as we don't know the
        # exact repo base, so take it from the xml and pass it to nix
        xml = "".join(info["text"].split())
        new_data[scheme][rest][ext] = {
            "groupId": xml.split("<groupId>")[1].split("</groupId>")[0],
        }
        if "<release>" in xml:
            new_data[scheme][rest][ext]["release"] = xml.split("<release>")[1].split(
                "</release>"
            )[0]
        if "<latest>" in xml:
            latest = xml.split("<latest>")[1].split("</latest>")[0]
            if latest != new_data[scheme][rest][ext].get("release"):
                new_data[scheme][rest][ext]["latest"] = latest
        if "<lastUpdated>" in xml:
            new_data[scheme][rest][ext]["lastUpdated"] = xml.split("<lastUpdated>")[
                1
            ].split("</lastUpdated>")[0]
    else:
        raise Exception("Unsupported key: " + repr(info))

# At this point, we have a map by part1 (initially the scheme), part2 (initially a
# slash-separated string without the scheme and with potential # substitution as
# seen above), extension.
# Now, push some segments from "part2" into "part1" like this:
#  https # part1
#   domain1/b # part2
#   domain1/c
#   domain2/a
#   domain2/c
# ->
#  https/domain1 # part1
#   b # part2
#   c
#  https/domain2 # part1
#   a # part2
#   c
# This helps reduce the lockfile size because a Gradle project will usually use lots
# of files from a single Maven repo

data = new_data
changed = True
while changed:
    changed = False
    new_data = {}
    for part1, info1 in data.items():
        starts: Set[str] = set()
        # by how many bytes the file size will be increased (roughly)
        lose = 0
        # by how many bytes the file size will be reduced (roughly)
        win = 0
        # how many different initial part2 segments there are
        count = 0
        for part2, info2 in info1.items():
            if "/" not in part2:
                # can't push a segment from part2 into part1
                count = 0
                break
            st = part2.split("/", 1)[0]
            if st not in starts:
                lose += len(st) + 1
                count += 1
                starts.add(st)
            win += len(st) + 1
        if count == 0:
            new_data[part1] = info1
            continue
        # only allow pushing part2 segments into path1 if *either*:
        # - the domain isn't yet part of part1
        # - the initial part2 segment is always the same
        if count != 1 and "." in part1:
            new_data[part1] = info1
            continue
        # some heuristics that may or may not work well (originally this was
        # used when the above if wasn't here, but perhaps it's useless now)
        lose += (count - 1) * max(0, len(part1) - 4)
        if win > lose or ("." not in part1 and win >= lose):
            changed = True
            for part2, info2 in info1.items():
                st, part3 = part2.split("/", 1)
                new_part1 = part1 + "/" + st
                if new_part1 not in new_data.keys():
                    new_data[new_part1] = {}
                new_data[new_part1][part3] = info2
        else:
            new_data[part1] = info1
    data = new_data

new_data["!comment"] = "This is a nixpkgs Gradle dependency lockfile. For more details, refer to the Gradle section in the nixpkgs manual."  # type: ignore
new_data["!version"] = 1  # type: ignore

with open(sys.argv[2], "wt") as f:
    json.dump(new_data, f, sort_keys=True, indent=1)
    f.write("\n")