From b6e36923b3106b108b63fe7714292840ae6100e2 Mon Sep 17 00:00:00 2001
From: Gareth Pendleton
Date: Thu, 9 Oct 2025 14:01:50 +0100
Subject: [PATCH] feat: Added charref parsing

---
Review notes (text in this section is ignored by `git am`):

- default_entities: the "gt" value was "#62;", missing the leading "&" that
  the "lt" and "amp" entries carry, so it was not a character reference.
  The "quot" value held an unescaped double quote, which ends the string
  literal early. Both are corrected in place below; hunk line counts and the
  diffstat are unchanged. The `index` post-image hash predates these two
  corrections.
- main(): the removed and added string arguments read identically here.
  Presumably markup inside the literal was lost in transit - TODO confirm
  against the commit.
- parse_reference: both the hex and the decimal branch return an empty value
  and push the decoded character back onto the remaining input. If the caller
  parses that input again, "&#60;" would be seen as markup rather than as
  character data - TODO confirm the intended contract with the caller.
- parse_hex_digit is `pub` and names its parameter `str`, while the sibling
  parse_digit is private and uses `doc` - TODO confirm this is intentional.

 src/glxml.gleam | 88 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 9 deletions(-)

diff --git a/src/glxml.gleam b/src/glxml.gleam
index 1e5c0fb..da1d18d 100644
--- a/src/glxml.gleam
+++ b/src/glxml.gleam
@@ -1,5 +1,6 @@
 import gleam/bool
 import gleam/dict
+import gleam/int
 import gleam/list
 import gleam/option.{type Option, None, Some}
 import gleam/result
@@ -10,7 +11,9 @@ pub type Declaration {
 }
 
 pub type Entity {
-  Entity
+  InternalEntity(value: String)
+  SystemExternalEntity(literal: String)
+  PublicExternalEntity(literal: String, pubidliteral: String)
 }
 
 pub type DocType {
@@ -36,11 +39,21 @@ pub type Element {
 
 pub fn main() {
   parse_document(
-    "\r\n \n",
+    "\r\n \n",
   )
   |> echo
 }
 
+pub fn default_entities() -> dict.Dict(String, Entity) {
+  dict.from_list([
+    #("lt", InternalEntity("&#60;")),
+    #("gt", InternalEntity("&#62;")),
+    #("amp", InternalEntity("&#38;")),
+    #("apos", InternalEntity("'")),
+    #("quot", InternalEntity("\"")),
+  ])
+}
+
 fn parse_document(doc: String) -> Result(Document, Nil) {
   use #(decl, doctype, doc) <- result.try(parse_prolog(doc))
   use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None)))
@@ -162,7 +175,35 @@ fn parse_reference(
   doctype: Option(DocType),
 ) -> Result(#(String, String), Nil) {
   case doc {
-    "&" as char <> tail | "%" as char <> tail -> {
+    "&#" <> tail -> {
+      case tail {
+        "x" <> tail -> {
+          use #(digits, doc) <- result.try(parse_multiple(tail, parse_hex_digit))
+
+          case doc {
+            ";" <> tail -> {
+              use value <- result.try(int.base_parse(digits, 16))
+              use codepoint <- result.try(string.utf_codepoint(value))
+              Ok(#("", string.from_utf_codepoints([codepoint]) <> tail))
+            }
+            _ -> Error(Nil)
+          }
+        }
+        _ -> {
+          use #(digits, doc) <- result.try(parse_multiple(tail, parse_digit))
+
+          case doc {
+            ";" <> tail -> {
+              use value <- result.try(int.base_parse(digits, 10))
+              use codepoint <- result.try(string.utf_codepoint(value))
+              Ok(#("", string.from_utf_codepoints([codepoint]) <> tail))
+            }
+            _ -> Error(Nil)
+          }
+        }
+      }
+    }
+    "&" as char <> tail -> {
       use #(name, doc) <- result.try(parse_name(tail))
 
       case doc {
@@ -269,7 +310,7 @@ fn do_parse_version(
   doc: String,
   version: String,
 ) -> Result(#(String, String), Nil) {
-  case do_parse_digit(doc) {
+  case parse_digit(doc) {
     Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit)
     Error(_) if version == "" -> Error(Nil)
     Error(_) -> Ok(#(version, doc))
@@ -302,14 +343,14 @@ fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) {
 }
 
 fn parse_encoding(doc: String) -> Result(#(String, String), Nil) {
-  case do_parse_alpha(doc) {
+  case parse_alpha(doc) {
     Ok(#(char, doc)) -> {
       Ok(parse_multiple_optional(
         doc,
         try_parsers(
           [
-            do_parse_alpha,
-            do_parse_digit,
+            parse_alpha,
+            parse_digit,
             fn(doc) {
               case doc {
                 "." as char <> tail | "_" as char <> tail | "-" as char <> tail ->
@@ -336,7 +377,7 @@ fn parse_standalone(doc: String) -> Result(#(Bool, String), Nil) {
   }
 }
 
-fn do_parse_digit(doc: String) -> Result(#(String, String), Nil) {
+fn parse_digit(doc: String) -> Result(#(String, String), Nil) {
   case doc {
     "0" as digit <> tail
     | "1" as digit <> tail
@@ -352,7 +393,36 @@ fn do_parse_digit(doc: String) -> Result(#(String, String), Nil) {
   }
 }
 
-fn do_parse_alpha(doc: String) -> Result(#(String, String), Nil) {
+pub fn parse_hex_digit(str: String) -> Result(#(String, String), Nil) {
+  case str {
+    "0" as digit <> tail
+    | "1" as digit <> tail
+    | "2" as digit <> tail
+    | "3" as digit <> tail
+    | "4" as digit <> tail
+    | "5" as digit <> tail
+    | "6" as digit <> tail
+    | "7" as digit <> tail
+    | "8" as digit <> tail
+    | "9" as digit <> tail
+    | "a" as digit <> tail
+    | "b" as digit <> tail
+    | "c" as digit <> tail
+    | "d" as digit <> tail
+    | "e" as digit <> tail
+    | "f" as digit <> tail
+    | "A" as digit <> tail
+    | "B" as digit <> tail
+    | "C" as digit <> tail
+    | "D" as digit <> tail
+    | "E" as digit <> tail
+    | "F" as digit <> tail -> Ok(#(digit, tail))
+
+    _ -> Error(Nil)
+  }
+}
+
+fn parse_alpha(doc: String) -> Result(#(String, String), Nil) {
   case doc {
     "a" as char <> tail
     | "b" as char <> tail