diff --git a/src/glxml.gleam b/src/glxml.gleam index deb633c..1e5c0fb 100644 --- a/src/glxml.gleam +++ b/src/glxml.gleam @@ -1,4 +1,7 @@ -import gleam/option.{type Option, None} +import gleam/bool +import gleam/dict +import gleam/list +import gleam/option.{type Option, None, Some} import gleam/result import gleam/string @@ -6,25 +9,185 @@ pub type Declaration { Declaration(versioninfo: String, encoding: String, standalone: Bool) } +pub type Entity { + Entity +} + pub type DocType { - DocType(name: String) + DocType(name: String, entities: dict.Dict(String, Entity)) } pub type Document { - Document(decl: Declaration, doctype: Option(DocType)) + Document( + decl: Declaration, + doctype: Option(DocType), + element: Option(Element), + ) +} + +pub type Attribute { + Attribute(name: String, value: String) +} + +pub type Element { + EmptyElem(name: String, attrs: List(Attribute)) + Element(name: String, attrs: List(Attribute), elements: List(Element)) } pub fn main() { parse_document( - "\r\n \n", + "\r\n \n", ) |> echo } fn parse_document(doc: String) -> Result(Document, Nil) { - use #(decl, doctype, _doc) <- result.try(parse_prolog(doc)) + use #(decl, doctype, doc) <- result.try(parse_prolog(doc)) + use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None))) + use #(element, doc) <- result.try(parse_element(doc, doctype)) + let doc = parse_misc(doc) - Ok(Document(decl, doctype)) + case doc { + "" -> Ok(Document(decl, doctype, Some(element))) + _ -> Error(Nil) + } +} + +fn parse_element( + doc: String, + doctype: Option(DocType), +) -> Result(#(Element, String), Nil) { + try_parsers([parse_empty_elem(_, doctype)], doc) +} + +fn parse_empty_elem( + doc: String, + doctype: Option(DocType), +) -> Result(#(Element, String), Nil) { + case doc { + "<" <> tail -> { + use #(name, doc) <- result.try(parse_name(tail)) + use #(attrs, doc) <- result.try(parse_attributes(doc, doctype, [])) + let doc = trim_space(doc) + case doc { + "/>" <> tail -> 
Ok(#(EmptyElem(name, attrs), tail)) + _ -> Error(Nil) + } + } + _ -> Error(Nil) + } +} + +fn parse_attributes( + doc: String, + doctype: Option(DocType), + attrs: List(Attribute), +) -> Result(#(List(Attribute), String), Nil) { + case parse_attribute(doc, doctype) { + Ok(#(attr, doc)) -> parse_attributes(doc, doctype, [attr, ..attrs]) + Error(_) -> Ok(#(list.reverse(attrs), doc)) + } +} + +fn parse_attribute( + doc: String, + doctype: Option(DocType), +) -> Result(#(Attribute, String), Nil) { + let doc = trim_space(doc) + use #(name, doc) <- result.try(parse_name(doc)) + case trim_space(doc) { + "=" <> tail -> { + case trim_space(tail) { + "\"" <> tail -> { + let #(value, doc) = + parse_multiple_optional( + tail, + try_parsers( + [ + fn(doc) { + case string.pop_grapheme(doc) { + Ok(#(char, _doc)) + if char == "<" || char == "&" || char == "\"" + -> Error(Nil) + Ok(#(char, doc)) -> Ok(#(char, doc)) + Error(_) -> Error(Nil) + } + }, + parse_reference(_, doctype), + ], + _, + ), + "", + ) + case doc { + "\"" <> tail -> Ok(#(Attribute(name, value), tail)) + _ -> Error(Nil) + } + } + "'" <> tail -> { + let #(value, doc) = + parse_multiple_optional( + tail, + try_parsers( + [ + fn(doc) { + case string.pop_grapheme(doc) { + Ok(#(char, _doc)) + if char == "<" || char == "&" || char == "'" + -> Error(Nil) + Ok(#(char, doc)) -> Ok(#(char, doc)) + Error(_) -> Error(Nil) + } + }, + parse_reference(_, doctype), + ], + _, + ), + "", + ) + case doc { + "'" <> tail -> Ok(#(Attribute(name, value), tail)) + _ -> Error(Nil) + } + } + _ -> Error(Nil) + } + } + _ -> Error(Nil) + } +} + +fn parse_reference( + doc: String, + doctype: Option(DocType), +) -> Result(#(String, String), Nil) { + case doc { + "&" as char <> tail | "%" as char <> tail -> { + use #(name, doc) <- result.try(parse_name(tail)) + + case doc { + ";" <> tail -> Ok(#(char <> name <> ";", tail)) + _ -> Error(Nil) + } + } + _ -> Error(Nil) + } +} + +fn parse_name(doc: String) -> Result(#(String, String), Nil) { + case parse_name_start_char(doc) 
{ + Ok(#(char, tail)) -> { + do_parse_name(tail, char) + } + Error(_) -> Error(Nil) + } +} + +fn do_parse_name(doc: String, name: String) -> Result(#(String, String), Nil) { + case parse_name_char(doc) { + Ok(#(char, tail)) -> do_parse_name(tail, name <> char) + Error(_) -> Ok(#(name, doc)) + } } fn parse_prolog( @@ -34,15 +197,19 @@ fn parse_prolog( Ok(#(decl, doc)) -> #(decl, doc) _ -> #(Declaration("1.0", "UTF-8", False), doc) } - let #(comment, doc) = + let doc = parse_misc(doc) + + Ok(#(decl, None, doc)) +} + +fn parse_misc(doc: String) -> String { + let #(_, doc) = parse_multiple_optional( doc, try_parsers([parse_comment, parse_space], _), "", ) - comment |> echo - doc |> echo - Ok(#(decl, None, doc)) + doc } fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) { @@ -69,8 +236,7 @@ fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) { } fn parse_versioninfo(doc: String) -> Result(#(String, String), Nil) { - use #(_, doc) <- result.try(parse_space(doc)) - case doc { + case trim_space(doc) { "version=" <> tail -> { use #(version, doc) <- result.try(parse_version(tail)) Ok(#(version, doc)) @@ -111,9 +277,7 @@ fn do_parse_version( } fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) { - use #(_, doc) <- result.try(parse_space(doc)) - - case doc { + case trim_space(doc) { "encoding=" <> tail -> { case tail { "\"" <> tail -> { @@ -164,9 +328,7 @@ fn parse_encoding(doc: String) -> Result(#(String, String), Nil) { } fn parse_standalone(doc: String) -> Result(#(Bool, String), Nil) { - use #(_, doc) <- result.try(parse_space(doc)) - - case doc { + case trim_space(doc) { "standalone=\"yes\"" <> tail | "standalone='yes'" <> tail -> Ok(#(True, tail)) - "standalone=\"no\"" <> tail | "standalone='no'" <> tail -> Ok(#(True, tail)) + "standalone=\"no\"" <> tail | "standalone='no'" <> tail -> Ok(#(False, tail)) @@ -309,6 +471,66 @@ fn parse_char(doc: String) -> Result(#(String, String), Nil) { } } +fn parse_name_start_char(doc: String) -> Result(#(String, String), Nil) { + case string.pop_grapheme(doc) { + 
Ok(#(":", tail)) -> Ok(#(":", tail)) + Ok(#("_", tail)) -> Ok(#("_", tail)) + Ok(#(char, tail)) -> { + let assert [codepoint, ..] = string.to_utf_codepoints(char) + case string.utf_codepoint_to_int(codepoint) { + i if i >= 0x41 && i <= 0x5A -> Ok(#(char, tail)) + i if i >= 0x61 && i <= 0x7A -> Ok(#(char, tail)) + i if i >= 0xC0 && i <= 0xD6 -> Ok(#(char, tail)) + i if i >= 0xD8 && i <= 0xF6 -> Ok(#(char, tail)) + i if i >= 0xF8 && i <= 0x2FF -> Ok(#(char, tail)) + i if i >= 0x370 && i <= 0x37D -> Ok(#(char, tail)) + i if i >= 0x37F && i <= 0x1FFF -> Ok(#(char, tail)) + i if i >= 0x200C && i <= 0x200D -> Ok(#(char, tail)) + i if i >= 0x2070 && i <= 0x218F -> Ok(#(char, tail)) + i if i >= 0x2C00 && i <= 0x2FEF -> Ok(#(char, tail)) + i if i >= 0x3000 && i <= 0xD7FF -> Ok(#(char, tail)) + i if i >= 0xF900 && i <= 0xFDCF -> Ok(#(char, tail)) + i if i >= 0xFDF0 && i <= 0xFFFD -> Ok(#(char, tail)) + i if i >= 0x10000 && i <= 0xEFFFF -> Ok(#(char, tail)) + _ -> Error(Nil) + } + } + Error(_) -> Error(Nil) + } +} + +fn parse_name_char(doc: String) -> Result(#(String, String), Nil) { + case string.pop_grapheme(doc) { + Ok(#(":", tail)) -> Ok(#(":", tail)) + Ok(#("_", tail)) -> Ok(#("_", tail)) + Ok(#("-", tail)) -> Ok(#("-", tail)) + Ok(#(".", tail)) -> Ok(#(".", tail)) + Ok(#(char, tail)) -> { + let assert [codepoint, ..] = string.to_utf_codepoints(char) + case string.utf_codepoint_to_int(codepoint) { + i if i >= 0x30 && i <= 0x39 -> Ok(#(char, tail)) + i if i == 0xB7 -> Ok(#(char, tail)) + i if i >= 0x41 && i <= 0x5A -> Ok(#(char, tail)) + i if i >= 0x61 && i <= 0x7A -> Ok(#(char, tail)) + i if i >= 0xC0 && i <= 0xD6 -> Ok(#(char, tail)) + i if i >= 0xD8 && i <= 0xF6 -> Ok(#(char, tail)) + i if i >= 0xF8 && i <= 0x37D -> Ok(#(char, tail)) + i if i >= 0x37F && i <= 0x1FFF -> Ok(#(char, tail)) + i if i >= 0x200C && i <= 0x200D -> Ok(#(char, tail)) + i if i >= 0x203F && i <= 0x2040 -> Ok(#(char, tail)) + i if i >= 0x2070 && i <= 0x218F -> Ok(#(char, tail)) + i if i >= 0x2C00 && i <= 
0x2FEF -> Ok(#(char, tail)) + i if i >= 0x3000 && i <= 0xD7FF -> Ok(#(char, tail)) + i if i >= 0xF900 && i <= 0xFDCF -> Ok(#(char, tail)) + i if i >= 0xFDF0 && i <= 0xFFFD -> Ok(#(char, tail)) + i if i >= 0x10000 && i <= 0xEFFFF -> Ok(#(char, tail)) + _ -> Error(Nil) + } + } + Error(_) -> Error(Nil) + } +} + fn trim_space(doc: String) -> String { case parse_space(doc) { Ok(#(_, doc)) -> trim_space(doc)