Compare commits

..

5 Commits

Author SHA1 Message Date
616173c5c0 feat: Started adding references
Some checks failed
test / test (push) Has been cancelled
2025-10-09 14:56:04 +01:00
b6e36923b3 feat: Added charref parsing
Some checks failed
test / test (push) Has been cancelled
2025-10-09 14:01:50 +01:00
d87b582056 feat: Parse empty element
Some checks failed
test / test (push) Has been cancelled
2025-10-09 12:43:59 +01:00
26499833c0 feat: Added comment parsing
Some checks failed
test / test (push) Has been cancelled
2025-10-09 01:26:55 +01:00
25fe8c7264 feat: Added standalone property and tweaked xmldecl parsing
Some checks failed
test / test (push) Has been cancelled
2025-10-09 00:54:21 +01:00

View File

@@ -1,34 +1,291 @@
import gleam/bool
import gleam/dict
import gleam/int
import gleam/list
import gleam/option.{type Option, None, Some}
import gleam/result import gleam/result
import gleam/string
pub type Declaration { pub type Declaration {
Declaration(versioninfo: String, encoding: String) Declaration(versioninfo: String, encoding: String, standalone: Bool)
}
pub type Entity {
InternalEntity(value: String)
SystemExternalEntity(literal: String)
PublicExternalEntity(literal: String, pubidliteral: String)
} }
pub type DocType { pub type DocType {
None DocType(name: String, entities: dict.Dict(String, Entity))
DocType(name: String)
} }
pub type Document { pub type Document {
Document(decl: Declaration, doctype: DocType) Document(
decl: Declaration,
doctype: Option(DocType),
element: Option(Element),
)
}
/// A single attribute attached to an element.
pub type Attribute {
/// `name` is the attribute name and `value` is the text collected between the
/// quotes by `parse_attribute`, with any references it parsed already applied.
Attribute(name: String, value: String)
}
pub type Element {
EmptyElem(name: String, attrs: List(Attribute))
Element(name: String, attrs: List(Attribute), elements: List(Element))
} }
pub fn main() { pub fn main() {
parse_document("<?xml version=\"1.1\" encoding='UTF-8'?>") |> echo parse_document(
"<?xml version=\"1.1\" encoding='UTF-8'?>\r\n <!-- hello-world --> \n<a attr='ha&#x20;&#38;#38;ha' battr='baba' ref='&amp;'/>",
)
|> echo
}
pub fn default_entities() -> dict.Dict(String, Entity) {
dict.from_list([
#("lt", InternalEntity("&#60;")),
#("gt", InternalEntity("&#62;")),
#("amp", InternalEntity("&#38;")),
#("apos", InternalEntity("&#39;")),
#("quot", InternalEntity("&#34;")),
])
} }
fn parse_document(doc: String) -> Result(Document, Nil) { fn parse_document(doc: String) -> Result(Document, Nil) {
use #(decl, doctype, _doc) <- result.try(parse_prolog(doc)) use #(decl, doctype, doc) <- result.try(parse_prolog(doc))
use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None)))
use #(element, doc) <- result.try(parse_element(doc, doctype))
let doc = parse_misc(doc)
Ok(Document(decl, doctype)) case doc {
"" -> Ok(Document(decl, doctype, Some(element)))
_ -> Error(Nil)
}
} }
fn parse_prolog(doc: String) -> Result(#(Declaration, DocType, String), Nil) { fn parse_element(
use #(decl, doc) <- result.try(parse_decl(doc)) doc: String,
doctype: Option(DocType),
) -> Result(#(Element, String), Nil) {
try_parsers([parse_empty_elem(_, doctype)], doc)
}
/// Parses an empty-element tag such as `<name attr='v'/>` from the start of
/// `doc`.
///
/// Returns the `EmptyElem` together with the unconsumed remainder, or
/// `Error(Nil)` when `doc` does not begin with `<`, the name cannot be parsed,
/// or the tag is not terminated by `/>`.
fn parse_empty_elem(
  doc: String,
  doctype: Option(DocType),
) -> Result(#(Element, String), Nil) {
  case doc {
    "<" <> after_bracket -> {
      use #(name, after_name) <- result.try(parse_name(after_bracket))
      use #(attrs, after_attrs) <- result.try(
        parse_attributes(after_name, doctype, []),
      )
      // Whitespace may sit between the last attribute and the closing `/>`.
      case trim_space(after_attrs) {
        "/>" <> rest -> Ok(#(EmptyElem(name, attrs), rest))
        _ -> Error(Nil)
      }
    }
    _ -> Error(Nil)
  }
}
/// Collects as many attributes as can be parsed from the start of `doc`.
///
/// `attrs` is the accumulator (most recent first); callers start it at `[]`.
/// This function never fails: the first input that `parse_attribute` rejects
/// ends the loop, and the attributes gathered so far are returned in source
/// order along with the input at that point.
fn parse_attributes(
  doc: String,
  doctype: Option(DocType),
  attrs: List(Attribute),
) -> Result(#(List(Attribute), String), Nil) {
  case parse_attribute(doc, doctype) {
    // No further attribute: restore source order and hand back what is left.
    Error(_) -> Ok(#(list.reverse(attrs), doc))
    Ok(#(parsed, remaining)) ->
      parse_attributes(remaining, doctype, [parsed, ..attrs])
  }
}
/// Parses one `name="value"` or `name='value'` attribute.
///
/// Leading whitespace is skipped first. Returns the attribute and the input
/// following the closing quote, or `Error(Nil)` when the name, the `=`, the
/// opening quote or the matching closing quote is missing.
fn parse_attribute(
  doc: String,
  doctype: Option(DocType),
) -> Result(#(Attribute, String), Nil) {
  let doc = trim_space(doc)
  use #(name, doc) <- result.try(parse_name(doc))
  case doc {
    "=\"" <> tail -> parse_quoted_attribute_value(name, tail, "\"", doctype)
    "='" <> tail -> parse_quoted_attribute_value(name, tail, "'", doctype)
    _ -> Error(Nil)
  }
}

/// Parses an attribute value up to the closing `quote` (either `"\""` or
/// `"'"`); `doc` starts just after the opening quote.
fn parse_quoted_attribute_value(
  name: String,
  doc: String,
  quote: String,
  doctype: Option(DocType),
) -> Result(#(Attribute, String), Nil) {
  // A plain character is anything except `<`, `&` and the active quote.
  let plain_char = fn(input) {
    case string.pop_grapheme(input) {
      Ok(#(char, _)) if char == "<" || char == "&" || char == quote ->
        Error(Nil)
      Ok(#(char, rest)) -> Ok(#(char, rest))
      Error(_) -> Error(Nil)
    }
  }
  let #(value, rest) =
    parse_multiple_optional(
      doc,
      try_parsers([plain_char, parse_reference(_, doctype)], _),
      "",
    )
  // The value must be terminated by the same quote that opened it.
  case quote, rest {
    "\"", "\"" <> tail | "'", "'" <> tail -> Ok(#(Attribute(name, value), tail))
    _, _ -> Error(Nil)
  }
}
/// Parses a reference at the start of `doc`.
///
/// * `&#xHEX;` and `&#DEC;` character references yield the referenced
///   character and the input after the `;`.
/// * `&name;` entity references are resolved through `process_reference`; the
///   replacement text is pushed back in front of the remaining input and an
///   empty string is returned as the parsed value, so the caller parses the
///   replacement text next.
///
/// Returns `Error(Nil)` for anything else, for a missing `;`, for a number that
/// is not a valid codepoint, or for an entity that cannot be resolved.
fn parse_reference(
  doc: String,
  doctype: Option(DocType),
) -> Result(#(String, String), Nil) {
  case doc {
    // The hexadecimal form must be tried before the decimal one.
    "&#x" <> tail -> parse_char_reference(tail, parse_hex_digit, 16)
    "&#" <> tail -> parse_char_reference(tail, parse_digit, 10)
    "&" <> tail -> {
      use #(name, rest) <- result.try(parse_name(tail))
      case rest {
        ";" <> after -> {
          use replacement <- result.try(process_reference(name, doctype))
          Ok(#("", replacement <> after))
        }
        _ -> Error(Nil)
      }
    }
    _ -> Error(Nil)
  }
}

/// Parses the digits and closing `;` of a character reference in the given
/// `base`; `doc` starts just after `&#` or `&#x`.
fn parse_char_reference(
  doc: String,
  digit_parser: fn(String) -> Result(#(String, String), Nil),
  base: Int,
) -> Result(#(String, String), Nil) {
  use #(digits, rest) <- result.try(parse_multiple(doc, digit_parser))
  case rest {
    ";" <> tail -> {
      use value <- result.try(int.base_parse(digits, base))
      use codepoint <- result.try(string.utf_codepoint(value))
      Ok(#(string.from_utf_codepoints([codepoint]), tail))
    }
    _ -> Error(Nil)
  }
}
/// Resolves the entity named `ref` to its replacement text.
///
/// The lookup starts in the entities of `doctype` when one is present and in
/// `default_entities()` otherwise.
fn process_reference(
  ref: String,
  doctype: Option(DocType),
) -> Result(String, Nil) {
  let entities = case doctype {
    None -> default_entities()
    Some(DocType(_, declared)) -> declared
  }
  get_reference(entities, ref)
}
/// Looks up `ref` in `entities`, falling back to `default_entities()` when the
/// name is not present there.
///
/// Only internal entities resolve to a value; an external entity, or a name
/// found in neither table, yields `Error(Nil)`.
fn get_reference(
  entities: dict.Dict(String, Entity),
  ref: String,
) -> Result(String, Nil) {
  // The fallback applies only to a miss; an external entity found in
  // `entities` is not overridden by a default of the same name.
  let found = case dict.get(entities, ref) {
    Error(_) -> dict.get(default_entities(), ref)
    hit -> hit
  }
  case found {
    Ok(InternalEntity(val)) -> Ok(val)
    _ -> Error(Nil)
  }
}
/// Parses a name: one name-start character followed by any number of name
/// characters.
///
/// Returns the name and the remaining input, or `Error(Nil)` when `doc` does
/// not begin with a name-start character.
fn parse_name(doc: String) -> Result(#(String, String), Nil) {
  use #(first, rest) <- result.try(parse_name_start_char(doc))
  do_parse_name(rest, first)
}
/// Extends `name` with name characters taken from the front of `doc` until one
/// fails to parse, then returns the accumulated name and the remaining input.
/// Always returns `Ok`.
fn do_parse_name(doc: String, name: String) -> Result(#(String, String), Nil) {
  case parse_name_char(doc) {
    Error(_) -> Ok(#(name, doc))
    Ok(#(next, rest)) -> do_parse_name(rest, name <> next)
  }
}
fn parse_prolog(
doc: String,
) -> Result(#(Declaration, Option(DocType), String), Nil) {
let #(decl, doc) = case parse_decl(doc) {
Ok(#(decl, doc)) -> #(decl, doc)
_ -> #(Declaration("1.0", "UTF-8", False), doc)
}
let doc = parse_misc(doc)
Ok(#(decl, None, doc)) Ok(#(decl, None, doc))
} }
/// Consumes leading comments and whitespace and returns the remaining input.
///
/// The consumed text is discarded. Repetition is delegated to
/// `parse_multiple_optional` (defined elsewhere; presumably "zero or more").
fn parse_misc(doc: String) -> String {
  let comment_or_space = try_parsers([parse_comment, parse_space], _)
  let #(_skipped, rest) = parse_multiple_optional(doc, comment_or_space, "")
  rest
}
fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) { fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) {
case doc { case doc {
"<?xml" <> tail -> { "<?xml" <> tail -> {
@@ -37,9 +294,14 @@ fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) {
Ok(e) -> e Ok(e) -> e
Error(_) -> #("", doc) Error(_) -> #("", doc)
} }
let #(standalone, doc) = case parse_standalone(doc) {
Ok(e) -> e
Error(_) -> #(False, doc)
}
case trim_space(doc) { case trim_space(doc) {
"?>" <> tail -> Ok(#(Declaration(versioninfo:, encoding:), tail)) "?>" <> tail ->
Ok(#(Declaration(versioninfo:, encoding:, standalone:), tail))
_ -> Error(Nil) _ -> Error(Nil)
} }
} }
@@ -48,8 +310,7 @@ fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) {
} }
fn parse_versioninfo(doc: String) -> Result(#(String, String), Nil) { fn parse_versioninfo(doc: String) -> Result(#(String, String), Nil) {
use #(_, doc) <- result.try(parse_space(doc)) case trim_space(doc) {
case doc {
"version=" <> tail -> { "version=" <> tail -> {
use #(version, doc) <- result.try(parse_version(tail)) use #(version, doc) <- result.try(parse_version(tail))
Ok(#(version, doc)) Ok(#(version, doc))
@@ -82,7 +343,7 @@ fn do_parse_version(
doc: String, doc: String,
version: String, version: String,
) -> Result(#(String, String), Nil) { ) -> Result(#(String, String), Nil) {
case do_parse_digit(doc) { case parse_digit(doc) {
Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit) Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit)
Error(_) if version == "" -> Error(Nil) Error(_) if version == "" -> Error(Nil)
Error(_) -> Ok(#(version, doc)) Error(_) -> Ok(#(version, doc))
@@ -90,9 +351,7 @@ fn do_parse_version(
} }
fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) { fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) {
use #(_, doc) <- result.try(parse_space(doc)) case trim_space(doc) {
case doc {
"encoding=" <> tail -> { "encoding=" <> tail -> {
case tail { case tail {
"\"" <> tail -> { "\"" <> tail -> {
@@ -117,14 +376,14 @@ fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) {
} }
fn parse_encoding(doc: String) -> Result(#(String, String), Nil) { fn parse_encoding(doc: String) -> Result(#(String, String), Nil) {
case do_parse_alpha(doc) { case parse_alpha(doc) {
Ok(#(char, doc)) -> { Ok(#(char, doc)) -> {
Ok(parse_multiple_optional( Ok(parse_multiple_optional(
doc, doc,
try_parsers( try_parsers(
[ [
do_parse_alpha, parse_alpha,
do_parse_digit, parse_digit,
fn(doc) { fn(doc) {
case doc { case doc {
"." as char <> tail | "_" as char <> tail | "-" as char <> tail -> "." as char <> tail | "_" as char <> tail | "-" as char <> tail ->
@@ -142,7 +401,16 @@ fn parse_encoding(doc: String) -> Result(#(String, String), Nil) {
} }
} }
/// Parses the `standalone` pseudo-attribute of the XML declaration.
///
/// Leading whitespace is skipped. Returns `True` for `yes` and `False` for
/// `no` (single or double quoted) together with the remaining input, or
/// `Error(Nil)` when no standalone declaration is present.
fn parse_standalone(doc: String) -> Result(#(Bool, String), Nil) {
  case trim_space(doc) {
    "standalone=\"yes\"" <> tail | "standalone='yes'" <> tail ->
      Ok(#(True, tail))
    // "no" must map to False; it previously returned True like "yes".
    "standalone=\"no\"" <> tail | "standalone='no'" <> tail ->
      Ok(#(False, tail))
    _ -> Error(Nil)
  }
}
fn parse_digit(doc: String) -> Result(#(String, String), Nil) {
case doc { case doc {
"0" as digit <> tail "0" as digit <> tail
| "1" as digit <> tail | "1" as digit <> tail
@@ -158,7 +426,36 @@ fn do_parse_digit(doc: String) -> Result(#(String, String), Nil) {
} }
} }
fn do_parse_alpha(doc: String) -> Result(#(String, String), Nil) { pub fn parse_hex_digit(str: String) -> Result(#(String, String), Nil) {
case str {
"0" as digit <> tail
| "1" as digit <> tail
| "2" as digit <> tail
| "3" as digit <> tail
| "4" as digit <> tail
| "5" as digit <> tail
| "6" as digit <> tail
| "7" as digit <> tail
| "8" as digit <> tail
| "9" as digit <> tail
| "a" as digit <> tail
| "b" as digit <> tail
| "c" as digit <> tail
| "d" as digit <> tail
| "e" as digit <> tail
| "f" as digit <> tail
| "A" as digit <> tail
| "B" as digit <> tail
| "C" as digit <> tail
| "D" as digit <> tail
| "E" as digit <> tail
| "F" as digit <> tail -> Ok(#(digit, tail))
_ -> Error(Nil)
}
}
fn parse_alpha(doc: String) -> Result(#(String, String), Nil) {
case doc { case doc {
"a" as char <> tail "a" as char <> tail
| "b" as char <> tail | "b" as char <> tail
@@ -216,6 +513,127 @@ fn do_parse_alpha(doc: String) -> Result(#(String, String), Nil) {
} }
} }
/// Parses a `<!-- ... -->` comment at the start of `doc`.
///
/// Returns the comment text (without the delimiters) and the input after
/// `-->`, or `Error(Nil)` when `doc` does not start with `<!--` or the body is
/// not followed by `-->`.
fn parse_comment(doc: String) -> Result(#(String, String), Nil) {
  case doc {
    "<!--" <> body ->
      case do_parse_comment(body) {
        #(comment, "-->" <> rest) -> Ok(#(comment, rest))
        _ -> Error(Nil)
      }
    _ -> Error(Nil)
  }
}
/// Consumes comment body text from the front of `doc`.
///
/// Each step accepts either one non-dash character, or a single `-` followed
/// by one non-dash character, so the scan cannot pass a `--` sequence. Returns
/// the collected text and the remaining input.
fn do_parse_comment(doc: String) -> #(String, String) {
  // A dash is accepted only when the character after it is not another dash.
  let dash_then_char = fn(input) {
    case input {
      "-" <> after_dash ->
        parse_char_except_dash(after_dash)
        |> result.map(fn(parsed) {
          let #(char, rest) = parsed
          #("-" <> char, rest)
        })
      _ -> Error(Nil)
    }
  }
  parse_multiple_optional(
    doc,
    try_parsers([parse_char_except_dash, dash_then_char], _),
    "",
  )
}
/// Parses one character via `parse_char`, rejecting input that starts with
/// `-`.
fn parse_char_except_dash(doc: String) -> Result(#(String, String), Nil) {
  use <- bool.guard(when: string.starts_with(doc, "-"), return: Error(Nil))
  parse_char(doc)
}
/// Parses one grapheme from the front of `doc`, accepting it only when every
/// codepoint in it is a legal XML 1.0 `Char`.
///
/// Returns the grapheme and the remaining input, or `Error(Nil)` on empty
/// input or when the grapheme contains a disallowed codepoint.
///
/// A grapheme can consist of several codepoints (for example a letter followed
/// by a combining accent, or `"\r\n"`), so all of them are validated rather
/// than asserting that there is exactly one.
fn parse_char(doc: String) -> Result(#(String, String), Nil) {
  case string.pop_grapheme(doc) {
    Ok(#(char, tail)) -> {
      let legal =
        char
        |> string.to_utf_codepoints
        |> list.all(fn(codepoint) {
          is_xml_char_codepoint(string.utf_codepoint_to_int(codepoint))
        })
      case legal {
        True -> Ok(#(char, tail))
        False -> Error(Nil)
      }
    }
    Error(_) -> Error(Nil)
  }
}

/// Reports whether codepoint `i` is allowed by the XML 1.0 `Char` production:
/// tab, line feed, carriage return, or one of the three permitted ranges.
fn is_xml_char_codepoint(i: Int) -> Bool {
  i == 0x9
  || i == 0xA
  || i == 0xD
  || { i >= 0x20 && i <= 0xD7FF }
  || { i >= 0xE000 && i <= 0xFFFD }
  || { i >= 0x10000 && i <= 0x10FFFF }
}
/// Parses one XML `NameStartChar` from the front of `doc`.
///
/// Returns the character and the remaining input, or `Error(Nil)` when the
/// input is empty or its first codepoint may not start a name.
///
/// Names are defined per codepoint, not per grapheme: only the first codepoint
/// of the leading grapheme is consumed, and any further codepoints in that
/// grapheme (e.g. a combining accent) are handed back in front of the
/// remaining input.
fn parse_name_start_char(doc: String) -> Result(#(String, String), Nil) {
  use #(grapheme, tail) <- result.try(string.pop_grapheme(doc))
  case string.to_utf_codepoints(grapheme) {
    [first, ..trailing] ->
      case is_name_start_codepoint(string.utf_codepoint_to_int(first)) {
        True -> {
          let rest = case trailing {
            [] -> tail
            _ -> string.from_utf_codepoints(trailing) <> tail
          }
          Ok(#(string.from_utf_codepoints([first]), rest))
        }
        False -> Error(Nil)
      }
    [] -> Error(Nil)
  }
}

/// Reports whether codepoint `i` matches the XML `NameStartChar` production.
fn is_name_start_codepoint(i: Int) -> Bool {
  // ":" and "_"
  i == 0x3A
  || i == 0x5F
  || { i >= 0x41 && i <= 0x5A }
  || { i >= 0x61 && i <= 0x7A }
  || { i >= 0xC0 && i <= 0xD6 }
  || { i >= 0xD8 && i <= 0xF6 }
  || { i >= 0xF8 && i <= 0x2FF }
  || { i >= 0x370 && i <= 0x37D }
  || { i >= 0x37F && i <= 0x1FFF }
  || { i >= 0x200C && i <= 0x200D }
  || { i >= 0x2070 && i <= 0x218F }
  || { i >= 0x2C00 && i <= 0x2FEF }
  // The range starts at 0x3001; 0x3000 (ideographic space) is not a name char.
  || { i >= 0x3001 && i <= 0xD7FF }
  || { i >= 0xF900 && i <= 0xFDCF }
  || { i >= 0xFDF0 && i <= 0xFFFD }
  || { i >= 0x10000 && i <= 0xEFFFF }
}
/// Parses one XML `NameChar` from the front of `doc`.
///
/// Returns the character and the remaining input, or `Error(Nil)` when the
/// input is empty or its first codepoint may not appear in a name.
///
/// Names are defined per codepoint, not per grapheme: only the first codepoint
/// of the leading grapheme is consumed, and any further codepoints in that
/// grapheme are handed back in front of the remaining input. This also means a
/// multi-codepoint grapheme such as `"\r\n"` directly after a name is rejected
/// cleanly instead of failing an assertion.
fn parse_name_char(doc: String) -> Result(#(String, String), Nil) {
  use #(grapheme, tail) <- result.try(string.pop_grapheme(doc))
  case string.to_utf_codepoints(grapheme) {
    [first, ..trailing] ->
      case is_name_codepoint(string.utf_codepoint_to_int(first)) {
        True -> {
          let rest = case trailing {
            [] -> tail
            _ -> string.from_utf_codepoints(trailing) <> tail
          }
          Ok(#(string.from_utf_codepoints([first]), rest))
        }
        False -> Error(Nil)
      }
    [] -> Error(Nil)
  }
}

/// Reports whether codepoint `i` matches the XML `NameChar` production.
fn is_name_codepoint(i: Int) -> Bool {
  // ":", "_", "-" and "."
  i == 0x3A
  || i == 0x5F
  || i == 0x2D
  || i == 0x2E
  || { i >= 0x30 && i <= 0x39 }
  || i == 0xB7
  || { i >= 0x41 && i <= 0x5A }
  || { i >= 0x61 && i <= 0x7A }
  || { i >= 0xC0 && i <= 0xD6 }
  || { i >= 0xD8 && i <= 0xF6 }
  // Covers 0xF8-0x2FF, the combining marks 0x300-0x36F and 0x370-0x37D.
  || { i >= 0xF8 && i <= 0x37D }
  || { i >= 0x37F && i <= 0x1FFF }
  || { i >= 0x200C && i <= 0x200D }
  || { i >= 0x203F && i <= 0x2040 }
  || { i >= 0x2070 && i <= 0x218F }
  || { i >= 0x2C00 && i <= 0x2FEF }
  // The range starts at 0x3001; 0x3000 (ideographic space) is not a name char.
  || { i >= 0x3001 && i <= 0xD7FF }
  || { i >= 0xF900 && i <= 0xFDCF }
  || { i >= 0xFDF0 && i <= 0xFFFD }
  || { i >= 0x10000 && i <= 0xEFFFF }
}
fn trim_space(doc: String) -> String { fn trim_space(doc: String) -> String {
case parse_space(doc) { case parse_space(doc) {
Ok(#(_, doc)) -> trim_space(doc) Ok(#(_, doc)) -> trim_space(doc)