feat: Added charref parsing
Some checks failed
test / test (push) Has been cancelled

This commit is contained in:
2025-10-09 14:01:50 +01:00
parent d87b582056
commit b6e36923b3

View File

@@ -1,5 +1,6 @@
import gleam/bool import gleam/bool
import gleam/dict import gleam/dict
import gleam/int
import gleam/list import gleam/list
import gleam/option.{type Option, None, Some} import gleam/option.{type Option, None, Some}
import gleam/result import gleam/result
@@ -10,7 +11,9 @@ pub type Declaration {
} }
pub type Entity { pub type Entity {
Entity InternalEntity(value: String)
SystemExternalEntity(literal: String)
PublicExternalEntity(literal: String, pubidliteral: String)
} }
pub type DocType { pub type DocType {
@@ -36,11 +39,21 @@ pub type Element {
pub fn main() { pub fn main() {
parse_document( parse_document(
"<?xml version=\"1.1\" encoding='UTF-8'?>\r\n <!-- hello-world --> \n<a attr='haha' battr='baba' ref='&ref;'/>", "<?xml version=\"1.1\" encoding='UTF-8'?>\r\n <!-- hello-world --> \n<a attr='ha&#x20;ha' battr='baba' ref='&ref;'/>",
) )
|> echo |> echo
} }
/// Builds the table of the five entities that XML predefines (`lt`, `gt`,
/// `amp`, `apos`, `quot`), keyed by entity name.
///
/// Each value is an `InternalEntity` whose replacement text is written as a
/// character reference, following the declarations in XML 1.0 section 4.6.
/// `lt` and `amp` are escaped twice (`&#38;#60;`, `&#38;#38;`), as the
/// specification requires for those two entities.
pub fn default_entities() -> dict.Dict(String, Entity) {
  dict.from_list([
    #("lt", InternalEntity("&#38;#60;")),
    // Must start with `&`: a bare "#62;" is ordinary text, not a character
    // reference, so it would never resolve to ">".
    #("gt", InternalEntity("&#62;")),
    #("amp", InternalEntity("&#38;#38;")),
    #("apos", InternalEntity("&#39;")),
    #("quot", InternalEntity("&#34;")),
  ])
}
fn parse_document(doc: String) -> Result(Document, Nil) { fn parse_document(doc: String) -> Result(Document, Nil) {
use #(decl, doctype, doc) <- result.try(parse_prolog(doc)) use #(decl, doctype, doc) <- result.try(parse_prolog(doc))
use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None))) use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None)))
@@ -162,7 +175,35 @@ fn parse_reference(
doctype: Option(DocType), doctype: Option(DocType),
) -> Result(#(String, String), Nil) { ) -> Result(#(String, String), Nil) {
case doc { case doc {
"&" as char <> tail | "%" as char <> tail -> { "&#" <> tail -> {
case tail {
"x" <> tail -> {
use #(digits, doc) <- result.try(parse_multiple(tail, parse_hex_digit))
case doc {
";" <> tail -> {
use value <- result.try(int.base_parse(digits, 16))
use codepoint <- result.try(string.utf_codepoint(value))
Ok(#("", string.from_utf_codepoints([codepoint]) <> tail))
}
_ -> Error(Nil)
}
}
_ -> {
use #(digits, doc) <- result.try(parse_multiple(tail, parse_digit))
case doc {
";" <> tail -> {
use value <- result.try(int.base_parse(digits, 10))
use codepoint <- result.try(string.utf_codepoint(value))
Ok(#("", string.from_utf_codepoints([codepoint]) <> tail))
}
_ -> Error(Nil)
}
}
}
}
"&" as char <> tail -> {
use #(name, doc) <- result.try(parse_name(tail)) use #(name, doc) <- result.try(parse_name(tail))
case doc { case doc {
@@ -269,7 +310,7 @@ fn do_parse_version(
doc: String, doc: String,
version: String, version: String,
) -> Result(#(String, String), Nil) { ) -> Result(#(String, String), Nil) {
case do_parse_digit(doc) { case parse_digit(doc) {
Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit) Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit)
Error(_) if version == "" -> Error(Nil) Error(_) if version == "" -> Error(Nil)
Error(_) -> Ok(#(version, doc)) Error(_) -> Ok(#(version, doc))
@@ -302,14 +343,14 @@ fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) {
} }
fn parse_encoding(doc: String) -> Result(#(String, String), Nil) { fn parse_encoding(doc: String) -> Result(#(String, String), Nil) {
case do_parse_alpha(doc) { case parse_alpha(doc) {
Ok(#(char, doc)) -> { Ok(#(char, doc)) -> {
Ok(parse_multiple_optional( Ok(parse_multiple_optional(
doc, doc,
try_parsers( try_parsers(
[ [
do_parse_alpha, parse_alpha,
do_parse_digit, parse_digit,
fn(doc) { fn(doc) {
case doc { case doc {
"." as char <> tail | "_" as char <> tail | "-" as char <> tail -> "." as char <> tail | "_" as char <> tail | "-" as char <> tail ->
@@ -336,7 +377,7 @@ fn parse_standalone(doc: String) -> Result(#(Bool, String), Nil) {
} }
} }
fn do_parse_digit(doc: String) -> Result(#(String, String), Nil) { fn parse_digit(doc: String) -> Result(#(String, String), Nil) {
case doc { case doc {
"0" as digit <> tail "0" as digit <> tail
| "1" as digit <> tail | "1" as digit <> tail
@@ -352,7 +393,36 @@ fn do_parse_digit(doc: String) -> Result(#(String, String), Nil) {
} }
} }
fn do_parse_alpha(doc: String) -> Result(#(String, String), Nil) { pub fn parse_hex_digit(str: String) -> Result(#(String, String), Nil) {
// Matches exactly one leading hexadecimal digit (0-9, a-f, A-F), in either
// letter case. On a match, returns the digit together with the unconsumed
// remainder of `str`; any other input, including "", yields Error(Nil).
case str {
"0" as digit <> tail
| "1" as digit <> tail
| "2" as digit <> tail
| "3" as digit <> tail
| "4" as digit <> tail
| "5" as digit <> tail
| "6" as digit <> tail
| "7" as digit <> tail
| "8" as digit <> tail
| "9" as digit <> tail
| "a" as digit <> tail
| "b" as digit <> tail
| "c" as digit <> tail
| "d" as digit <> tail
| "e" as digit <> tail
| "f" as digit <> tail
| "A" as digit <> tail
| "B" as digit <> tail
| "C" as digit <> tail
| "D" as digit <> tail
| "E" as digit <> tail
| "F" as digit <> tail -> Ok(#(digit, tail))
_ -> Error(Nil)
}
}
fn parse_alpha(doc: String) -> Result(#(String, String), Nil) {
case doc { case doc {
"a" as char <> tail "a" as char <> tail
| "b" as char <> tail | "b" as char <> tail