# Source path: root/testing/web-platform/tests/tools/third_party/html5lib/utils/entities.py
# Blob: 6e8ca4580604825370bb4fdd5831449481a6b6a4
import json

import html5lib


def parse(path="html5ents.xml"):
    """Parse the entity definition document at *path* with html5lib.

    Args:
        path: Filesystem path of the document to parse. Defaults to
            ``html5ents.xml`` relative to the current working directory.

    Returns:
        The result of ``html5lib.parse`` using the ``lxml`` treebuilder.
    """
    # The return expression is evaluated inside the ``with`` block, so the
    # handle is closed deterministically instead of being left to the
    # garbage collector.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")


def entity_table(tree):
    """Build a dict mapping entity names to their replacement characters.

    Each ``tr`` directly under a ``tbody`` (XHTML namespace) contributes one
    entry: the joined text found under the row's first child is the name,
    and the ``.text`` of the row's second child is converted to characters.
    """
    xhtml_ns = {"h": "http://www.w3.org/1999/xhtml"}
    table = {}
    for row in tree.xpath("//h:tbody/h:tr", namespaces=xhtml_ns):
        raw_name = "".join(row[0].xpath(".//text()"))
        key = entity_name(raw_name)
        value = entity_characters(row[1].text)
        table[key] = value
    return table


def entity_name(inp):
    """Return *inp* with leading and trailing whitespace removed."""
    trimmed = inp.strip()
    return trimmed


def entity_characters(inp):
    """Convert whitespace-separated codepoint tokens into one string.

    Each token of *inp* is passed to ``codepoint_to_character`` and the
    resulting characters are concatenated in order.
    """
    characters = []
    for token in inp.split():
        if not token:
            continue
        characters.append(codepoint_to_character(token))
    return "".join(characters)


def codepoint_to_character(inp):
    """Return the character named by a hexadecimal codepoint token.

    The first two characters of *inp* are skipped (presumably a ``U+``
    prefix -- confirm against the source table) and the remainder is read
    as a hexadecimal codepoint of any length.

    Args:
        inp: Codepoint token such as ``"U+000C6"``.

    Returns:
        The text string holding the corresponding character.

    Raises:
        ValueError: If the remainder is not a hexadecimal number or does not
            name a valid codepoint.
    """
    codepoint = int(inp[2:], 16)
    # Zero-pad to the eight digits ``\U`` requires rather than assuming the
    # token always carries exactly five, and go through bytes so the escape
    # can be decoded on both Python 2 and Python 3.
    escape = "\\U%08x" % codepoint
    return escape.encode("ascii").decode("unicode-escape")


def make_tests_json(entities):
    """Return the JSON-serialisable test document for *entities*.

    The result is a dict with a single ``"tests"`` key whose value is the
    list of test cases, one per item produced by ``make_test_list``.
    """
    cases = []
    for item in make_test_list(entities):
        cases.append(make_test(*item))
    return {"tests": cases}


def make_test(name, characters, good):
    """Assemble one test case dict for the entity *name*.

    The case holds a human-readable description, the input (an ampersand
    followed by *name*) and the expected output for that input.
    """
    case = {}
    case["description"] = test_description(name, good)
    case["input"] = "&%s" % name
    case["output"] = test_expected(name, characters, good)
    return case


def test_description(name, good):
    with_semicolon = name.endswith(";")
    semicolon_text = {True: "with a semi-colon",
                      False: "without a semi-colon"}[with_semicolon]
    if good:
        text = "Named entity: %s %s" % (name, semicolon_text)
    else:
        text = "Bad named entity: %s %s" % (name, semicolon_text)
    return text


def test_expected(name, characters, good):
    rv = []
    if not good or not name.endswith(";"):
        rv.append("ParseError")
    rv.append(["Character", characters])
    return rv


def make_test_list(entities):
    """Return the sorted list of ``(name, characters, good)`` test tuples.

    Every entity yields a good case. An entity ending in a semi-colon for
    which no shorter prefix is itself an entity also yields a bad case: the
    name without its semi-colon, expected to come back as literal text.
    """
    cases = []
    for name, characters in entities.items():
        wants_bad_case = (name.endswith(";") and
                          not subentity_exists(name, entities))
        if wants_bad_case:
            bare = name[:-1]
            cases.append((bare, "&" + bare, False))
        cases.append((name, characters, True))
    cases.sort()
    return cases


def subentity_exists(entity_name, entities):
    """Report whether a proper, non-empty prefix of *entity_name* is known.

    Prefixes are checked from longest to shortest; the full name and the
    empty string are never checked.
    """
    return any(entity_name[:-trim] in entities
               for trim in range(1, len(entity_name)))


def make_entities_code(entities):
    """Render *entities* as Python source defining an ``entities`` dict.

    Args:
        entities: Mapping of entity name to its replacement characters.

    Returns:
        Source text of the form ``entities = {...}`` with one
        ``"name": u"value",`` line per entity, sorted by name. Values are
        written using ``unicode-escape`` escapes.
    """
    lines = []
    for name in sorted(entities):
        # ``encode`` yields bytes on Python 3; decode back to text so that
        # ``replace`` and ``%`` formatting work on both Python 2 and 3.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        # ``unicode-escape`` leaves double quotes untouched, so escape them
        # here to keep the generated string literal well formed.
        escaped = escaped.replace("\"", "\\\"")
        lines.append("    \"%s\": u\"%s\"," % (name, escaped))
    return """entities = {
%s
}""" % "\n".join(lines)


def main():
    """Generate the entity test file and the entity constants module.

    Parses the default entity document, writes the test cases as JSON to
    ``namedEntities.test`` and the generated ``entities`` dict source to
    ``entities_constants.py``, both in the current working directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Use context managers so both outputs are flushed and closed even if
    # serialisation or writing fails part-way through.
    with open("namedEntities.test", "w") as test_file:
        json.dump(tests_json, test_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)


# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()