summaryrefslogtreecommitdiffstats
path: root/include/orcus/sax_parser_base.hpp
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:48:59 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:48:59 +0000
commit c484829272cd13a738e35412498e12f2c9a194ac (patch)
tree a1f5ec09629ee895bd3963fa8820b45f2f4c574b /include/orcus/sax_parser_base.hpp
parent Initial commit. (diff)
downloadliborcus-upstream.tar.xz
liborcus-upstream.zip
Adding upstream version 0.19.2. upstream/0.19.2 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'include/orcus/sax_parser_base.hpp')
-rw-r--r-- include/orcus/sax_parser_base.hpp 207
1 files changed, 207 insertions, 0 deletions
diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp
new file mode 100644
index 0000000..4dcfc07
--- /dev/null
+++ b/include/orcus/sax_parser_base.hpp
@@ -0,0 +1,207 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_ORCUS_SAX_PARSER_BASE_HPP
+#define INCLUDED_ORCUS_SAX_PARSER_BASE_HPP
+
+#include "env.hpp"
+#include "cell_buffer.hpp"
+#include "parser_global.hpp"
+#include "parser_base.hpp"
+
+#include <cassert>
+#include <cstdlib>
+#include <exception>
+#include <sstream>
+#include <memory>
+
+#define ORCUS_DEBUG_SAX_PARSER 0 // nonzero: include <iostream> below and enable the end-of-stream check in next_and_char()
+
+#if ORCUS_DEBUG_SAX_PARSER
+#include <iostream>
+using std::cout; // NOTE(review): using-declarations at header scope become visible to every includer when debugging is on
+using std::endl;
+#endif
+
+namespace orcus { namespace sax {
+
+/**
+ * Document type declaration passed by sax_parser to its handler's doctype()
+ * call. All string members are non-owning std::string_view values.
+ */
+struct doctype_declaration
+{
+ enum class keyword_type { dtd_public, dtd_private }; // kind of DTD keyword found in the declaration
+
+ keyword_type keyword; // which keyword_type this declaration uses
+ std::string_view root_element; // name of the document's root element as given in the declaration
+ std::string_view fpi; // presumably the formal public identifier -- TODO confirm against the parser
+ std::string_view uri; // presumably the DTD location (system identifier) -- TODO confirm against the parser
+};
+
+/**
+ * Given an encoded name (such as 'quot' or 'amp'), return the single
+ * character that corresponds to the name. The name shouldn't include the
+ * leading '&' and trailing ';'.
+ *
+ * @param p pointer to the first character of the encoded name
+ * @param n length of the encoded name, in chars
+ *
+ * @return single character that corresponds to the encoded name. '\0' is
+ * returned if decoding fails.
+ */
+ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n);
+
+/**
+ * Given an encoded unicode value (such as #20A9), return a UTF-8 string
+ * that corresponds to the unicode value. The value shouldn't include the
+ * leading '&' and trailing ';'. NOTE(review): XML writes hex references as '#x20A9' -- confirm the example.
+ *
+ * @param p pointer to the first character of the encoded value
+ * @param n length of the encoded value, in chars
+ *
+ * @return string that corresponds to the encoded value. An empty string
+ * is returned if decoding fails.
+ */
+ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n);
+
+/**
+ * Element properties passed by sax_parser to its handler's open_element()
+ * and close_element() calls. The string members are non-owning views.
+ */
+struct parser_element
+{
+ /** Optional element namespace. It may be empty if it's not given. */
+ std::string_view ns;
+ /** Element name. */
+ std::string_view name;
+ /** Position of the opening angle bracket '<'. */
+ std::ptrdiff_t begin_pos;
+ /** Position immediately after the closing angle bracket '>'. */
+ std::ptrdiff_t end_pos;
+};
+
+/**
+ * Attribute properties passed by sax_parser to its handler's attribute()
+ * call. When an attribute value is "transient", it has been converted due to
+ * the presence of encoded character(s) and is stored in a temporary buffer.
+ * The handler must assume that the value will not survive after the callback
+ * function ends, and must copy it if it needs to keep it.
+ */
+struct parser_attribute
+{
+ /** Optional attribute namespace. It may be empty if it's not given. */
+ std::string_view ns;
+ /** Attribute name. */
+ std::string_view name;
+ /** Attribute value. Non-owning view; see the transient flag for its lifetime. */
+ std::string_view value;
+ /** Whether or not the attribute value is held in a temporary buffer. */
+ bool transient;
+};
+
+class ORCUS_PSR_DLLPUBLIC parser_base : public ::orcus::parser_base // SAX-specific helpers layered on the generic ::orcus::parser_base
+{
+ struct impl; // forward declaration only; the definition is not in this header
+ std::unique_ptr<impl> mp_impl;
+
+ parser_base() = delete; // must be constructed from a content buffer
+ parser_base(const parser_base&) = delete; // non-copyable
+ parser_base& operator=(const parser_base&) = delete;
+protected:
+ size_t m_nest_level; // incremented by nest_up(), decremented by nest_down()
+ size_t m_buffer_pos; // zeroed by reset_buffer_pos(); advanced by inc_buffer_pos() (defined elsewhere)
+ bool m_root_elem_open:1; // one-bit flag; it is not read or written anywhere in this header
+
+protected:
+ parser_base(const char* content, size_t size); // protected: only derived parsers can construct this class
+ ~parser_base();
+
+ void next_check() // calls next(), then throws malformed_xml_error if no character is available
+ {
+ next();
+ if (!has_char())
+ throw malformed_xml_error("xml stream ended prematurely.", offset());
+ }
+
+ void nest_up() { ++m_nest_level; }
+ void nest_down() // throws malformed_xml_error instead of wrapping around when the level is already 0
+ {
+ if (m_nest_level == 0)
+ throw malformed_xml_error("incorrect nesting in xml stream", offset());
+
+ --m_nest_level;
+ }
+
+ void inc_buffer_pos();
+ void reset_buffer_pos() { m_buffer_pos = 0; }
+
+ void has_char_throw(const char* msg) const // throws malformed_xml_error carrying msg when has_char() is false
+ {
+ if (!has_char())
+ throw malformed_xml_error(msg, offset());
+ }
+
+ char cur_char_checked() const // returns *mp_char only after has_char() has been verified; does not advance
+ {
+ if (!has_char())
+ throw malformed_xml_error("xml stream ended prematurely.", offset());
+
+ return *mp_char;
+ }
+
+ char next_and_char() // unchecked unless ORCUS_DEBUG_SAX_PARSER is nonzero: the caller must know a character follows
+ {
+ next();
+#if ORCUS_DEBUG_SAX_PARSER
+ if (mp_char >= mp_end)
+ throw malformed_xml_error("xml stream ended prematurely.", offset());
+#endif
+ return *mp_char;
+ }
+
+ char next_char_checked() // always-checked counterpart of next_and_char()
+ {
+ next();
+ if (!has_char())
+ throw malformed_xml_error("xml stream ended prematurely.", offset());
+
+ return *mp_char;
+ }
+
+ cell_buffer& get_cell_buffer();
+
+ void comment();
+
+ void expects_next(const char* p, size_t n);
+
+ void parse_encoded_char(cell_buffer& buf);
+ void value_with_encoded_char(cell_buffer& buf, std::string_view& str, char quote_char);
+
+ /**
+ * Parse a quoted value. Note that the retrieved string may be stored in a
+ * temporary cell buffer if the decode parameter is true. Use the string
+ * immediately after this call before the buffer becomes invalid.
+ *
+ * @note This method checks for valid stream; the caller doesn't need to
+ * check for valid stream before calling this method.
+ *
+ * @return true if the value is stored in a temporary buffer, false
+ * otherwise.
+ */
+ bool value(std::string_view& str, bool decode);
+
+ void name(std::string_view& str);
+ void element_name(parser_element& elem, std::ptrdiff_t begin_pos);
+ void attribute_name(std::string_view& attr_ns, std::string_view& attr_name);
+ void characters_with_encoded_char(cell_buffer& buf);
+};
+
+}}
+
+#endif
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */