src/python/orcus/tools/bugzilla.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

########################################################################
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
########################################################################

import argparse
import requests
import json
import os
import base64
import traceback
import concurrent.futures as cf
from pathlib import Path
from urllib.parse import urlparse


class BugzillaAccess:
    """Encapsulates access to a bugzilla server by using its REST API.

    Args:
        bzurl (str): URL to the bugzilla server.
        cache_dir (:obj:`pathlib.Path`): path to the cache directory.
    """

    def __init__(self, bzurl, cache_dir):
        self._bzurl = bzurl
        self._cache_dir = cache_dir
        os.makedirs(self._cache_dir, exist_ok=True)

    def _get_cache_content(self, cache_file, func_fetch):
        if os.path.isfile(cache_file):
            with open(cache_file, 'r') as f:
                return f.read()

        s = func_fetch()
        with open(cache_file, 'w') as f:
            f.write(s)

        return s

    def get_bug_ids(self, bz_params):
        """Get all bug ID's for specified bugzilla query parameters.

        Args:
            bz_params (dict):
                dictionary containing all search parameters. Each search term
                must form a single key-value pair.

        Returns (:obj:`list` of :obj:`str`):
            list of bug ID strings.
        """

        def _fetch():
            r = requests.get(
                f"{self._bzurl}/rest/bug",
                params=bz_params
            )

            if r.status_code != 200:
                raise RuntimeError(f"failed to query bug ids from the TDF bugzilla! (status:{r.status_code})")
            return r.text

        escape_chars = " /"
        buf = []
        for key in bz_params.keys():
            v = str(bz_params[key])
            for c in escape_chars:
                v = v.replace(c, '-')
            buf.append(key)
            buf.append(v)

        cache_file = '-'.join(buf) + ".json"
        cache_file = self._cache_dir / cache_file
        s = self._get_cache_content(cache_file, _fetch)

        content = json.loads(s)
        bugs = content.get("bugs")
        if not bugs:
            return []

        bug_ids = [bug.get("id") for bug in bugs]
        bug_ids = [x for x in filter(None, bug_ids)]
        return bug_ids

    def get_attachments(self, bug_id):
        """Fetch all attachments for specified bug."""

        def _fetch():
            r = requests.get(f"{self._bzurl}/rest/bug/{bug_id}/attachment")
            if r.status_code != 200:
                raise RuntimeError(
                    f"failed to fetch the attachments for bug {bug_id}! (status:{r.status_code})")
            return r.text

        cache_file = self._cache_dir / f"attachments-{bug_id}.json"
        s = self._get_cache_content(cache_file, _fetch)
        content = json.loads(s)
        attachments = list()
        for d in content["bugs"][str(bug_id)]:
            data = d["data"]
            if not data:
                continue
            bytes = base64.b64decode(data)
            attachments.append({
                "content_type": d["content_type"],
                "filename": d["file_name"],
                "data": bytes
            })
        return attachments


def parse_query_params(queries):
    bz_params = dict()
    for query in queries:
        k, v = query.split('=')
        if v and v[0] in ('"', "'"):
            if v[0] != v[-1]:
                raise argparse.ArgumentError(f"mis-matched quotes in {query}")
            v = v[1:-1]
        bz_params[k] = v
    return bz_params


def _create_argparser():
    parser = argparse.ArgumentParser(
        description="""This command allows you to download attachments from a
bugzilla server that supports REST API.""")
    parser.add_argument(
        "--outdir", "-o", type=str, required=True,
        help="""output directory for downloaded files. Downloaded files are
grouped by their respective bug ID's.""")
    parser.add_argument(
        "--limit", type=int, default=50,
        help="number of bugs to include in a single set of search results.")
    parser.add_argument(
        "--offset", type=int, default=0,
        help="number of bugs to skip in the search results.")
    parser.add_argument(
        "--cont", action="store_true", default=False,
        help="""when specified, the search continues after the initial batch
is returned, by retrieving the next batch of results until the entire search
results are returned. The number specified by the ``--limit`` option is used
as the batch size.""")
    parser.add_argument(
        "--worker", type=int, default=8,
        help="number of worker threads to use for parallel downloads of files.")
    parser.add_argument(
        "--cache-dir", type=Path, default=Path(".bugzilla"),
        help="""directory to keep downloaded bugzilla search results. The
command will not send the query request to the remote server when the results
are cached. You may want to delete the cache directory after you are finished.""")
    parser.add_argument(
        "--url", type=str, required=True,
        help="""base URL for bugzilla service. It must begin with the
``http(s)://`` prefix.""")
    parser.add_argument(
        "query", type=str, nargs='*',
        help="""One or more query term to use to limit your search. Each query
term must be in the form key=value. You need to quote the value string when the
value string contains whitespace character i.e. key="value with space".""")
    return parser


def main():
    parser = _create_argparser()
    args = parser.parse_args()

    bz_params = parse_query_params(args.query)

    for k, v in bz_params.items():
        print(f"{k}: {v}")

    bz_params["limit"] = args.limit
    bz_params["offset"] = args.offset

    url = urlparse(args.url)
    cache_dir = Path(args.cache_dir) / url.netloc
    bz = BugzillaAccess(args.url, cache_dir)

    def _run(bug_id, index, totals):
        """Top-level function for each worker thread."""
        width = len(str(totals))
        index_s = str(index+1)
        index_s = ' ' * (width - len(index_s)) + index_s
        print(f"({index_s}/{totals}) fetching attachments for bug {bug_id} ...", flush=True)

        try:
            attachments = bz.get_attachments(bug_id)
            for attachment in attachments:
                filepath = Path(args.outdir) / url.netloc / str(bug_id) / attachment["filename"]
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                with open(filepath, "wb") as f:
                    f.write(attachment["data"])
        except Exception as e:
            traceback.print_exc()
            print(e)

    iter_count = 0
    while True:
        bug_ids = bz.get_bug_ids(bz_params)
        if not bug_ids:
            return
        print(f"-- iteration {iter_count+1}", flush=True)
        with cf.ThreadPoolExecutor(max_workers=args.worker) as executor:
            for i, bug_id in enumerate(bug_ids):
                executor.submit(_run, bug_id, i, len(bug_ids))

        if not args.cont:
            return

        bz_params["offset"] += bz_params["limit"]
        iter_count += 1


if __name__ == "__main__":
    main()