summaryrefslogtreecommitdiffstats
path: root/include/orcus/sax_parser_base.hpp
blob: 4dcfc077a726e869c2daefc837ed40fda01de904 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#ifndef INCLUDED_ORCUS_SAX_PARSER_BASE_HPP
#define INCLUDED_ORCUS_SAX_PARSER_BASE_HPP

#include "env.hpp"
#include "cell_buffer.hpp"
#include "parser_global.hpp"
#include "parser_base.hpp"

#include <cassert>
#include <cstdlib>
#include <exception>
#include <sstream>
#include <memory>

#define ORCUS_DEBUG_SAX_PARSER 0

#if ORCUS_DEBUG_SAX_PARSER
#include <iostream>
using std::cout;
using std::endl;
#endif

namespace orcus { namespace sax {

/**
 * Document type declaration passed by sax_parser to its handler's doctype()
 * call.
 */
struct doctype_declaration
{
    enum class keyword_type { dtd_public, dtd_private };

    keyword_type keyword;
    std::string_view root_element;
    std::string_view fpi;
    std::string_view uri;
};

/**
 * Given an encoded name (such as 'quot' and 'amp'), return a single
 * character that corresponds with the name.  The name shouldn't include the
 * leading '&' and trailing ';'.
 *
 * @param p pointer to the first character of encoded name
 * @param n length of encoded name
 *
 * @return single character that corresponds with the encoded name.  '\0' is
 *         returned if decoding fails.
 */
ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n);

/**
 * Given an encoded unicode value (such as #20A9), return a UTF-8 string
 * that corresponds with the unicode value.  The value shouldn't include the
 * leading '&' and trailing ';'.
 *
 * @param p pointer to the first character of encoded name
 * @param n length of encoded name
 *
 * @return string that corresponds with the encoded value.  An empty string
 *         is returned if decoding fails.
 */
ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n);

/**
 * Element properties passed by sax_parser to its handler's open_element()
 * and close_element() calls.
 */
struct parser_element
{
    /** Optional element namespace. It may be empty if it's not given. */
    std::string_view ns;
    /** Element name. */
    std::string_view name;
    /** Position of the opening brace '<'. */
    std::ptrdiff_t begin_pos;
    /** Position immediately after the closing brace '>'. */
    std::ptrdiff_t end_pos;
};

/**
 * Attribute properties passed by sax_parser to its handler's attribute()
 * call. When an attribute value is "transient", it has been converted due to
 * presence of encoded character(s) and has been stored in a temporary buffer.
 * The handler must assume that the value will not survive after the callback
 * function ends.
 */
struct parser_attribute
{
    /** Optional attribute namespace.  It may be empty if it's not given. */
    std::string_view ns;
    /** Attribute name. */
    std::string_view name;
    /** Attribute value. */
    std::string_view value;
    /** Whether or not the attribute value is in a temporary buffer. */
    bool transient;
};

class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base
{
    struct impl;
    std::unique_ptr<impl> mp_impl;

    parser_base() = delete;
    parser_base(const parser_base&) = delete;
    parser_base& operator=(const parser_base&) = delete;
protected:
    size_t m_nest_level;
    size_t m_buffer_pos;
    bool m_root_elem_open:1;

protected:
    parser_base(const char* content, size_t size);
    ~parser_base();

    void next_check()
    {
        next();
        if (!has_char())
            throw malformed_xml_error("xml stream ended prematurely.", offset());
    }

    void nest_up() { ++m_nest_level; }
    void nest_down()
    {
        if (m_nest_level == 0)
            throw malformed_xml_error("incorrect nesting in xml stream", offset());

        --m_nest_level;
    }

    void inc_buffer_pos();
    void reset_buffer_pos() { m_buffer_pos = 0; }

    void has_char_throw(const char* msg) const
    {
        if (!has_char())
            throw malformed_xml_error(msg, offset());
    }

    char cur_char_checked() const
    {
        if (!has_char())
            throw malformed_xml_error("xml stream ended prematurely.", offset());

        return *mp_char;
    }

    char next_and_char()
    {
        next();
#if ORCUS_DEBUG_SAX_PARSER
        if (mp_char >= mp_end)
            throw malformed_xml_error("xml stream ended prematurely.", offset());
#endif
        return *mp_char;
    }

    char next_char_checked()
    {
        next();
        if (!has_char())
            throw malformed_xml_error("xml stream ended prematurely.", offset());

        return *mp_char;
    }

    cell_buffer& get_cell_buffer();

    void comment();

    void expects_next(const char* p, size_t n);

    void parse_encoded_char(cell_buffer& buf);
    void value_with_encoded_char(cell_buffer& buf, std::string_view& str, char quote_char);

    /**
     * Parse quoted value.  Note that the retrieved string may be stored in a
     * temporary cell buffer if the decode parameter is true. Use the string
     * immediately after this call before the buffer becomes invalid.
     *
     * @note This method checks for valid stream; the caller doesn't need to
     *       check for valid stream before calling this method.
     *
     * @return true if the value is stored in temporary buffer, false
     *         otherwise.
     */
    bool value(std::string_view& str, bool decode);

    void name(std::string_view& str);
    void element_name(parser_element& elem, std::ptrdiff_t begin_pos);
    void attribute_name(std::string_view& attr_ns, std::string_view& attr_name);
    void characters_with_encoded_char(cell_buffer& buf);
};

}}

#endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */