import gleam/bool import gleam/dict import gleam/int import gleam/list import gleam/option.{type Option, None, Some} import gleam/result import gleam/string pub type Declaration { Declaration(versioninfo: String, encoding: String, standalone: Bool) } pub type Entity { InternalEntity(value: String) SystemExternalEntity(literal: String) PublicExternalEntity(literal: String, pubidliteral: String) } pub type DocType { DocType(name: String, entities: dict.Dict(String, Entity)) } pub type Document { Document( decl: Declaration, doctype: Option(DocType), element: Option(Element), ) } pub type Attribute { Attribute(name: String, value: String) } pub type Element { EmptyElem(name: String, attrs: List(Attribute)) Element(name: String, attrs: List(Attribute), elements: List(Element)) Text(content: String) Comment(content: String) CData(content: String) PI(name: String, content: String) Whitespace } pub fn main() { parse_document( "\r\n \n]]>", ) |> echo } pub fn default_entities() -> dict.Dict(String, Entity) { dict.from_list([ #("lt", InternalEntity("<")), #("gt", InternalEntity(">")), #("amp", InternalEntity("&")), #("apos", InternalEntity("'")), #("quot", InternalEntity(""")), ]) } fn parse_document(doc: String) -> Result(Document, Nil) { use #(decl, doctype, doc) <- result.try(parse_prolog(doc)) use <- bool.guard(when: doc == "", return: Ok(Document(decl, doctype, None))) use #(element, doc) <- result.try(parse_element(doc, doctype)) let doc = parse_misc(doc) case doc { "" -> Ok(Document(decl, doctype, Some(element))) _ -> Error(Nil) } } fn parse_element( doc: String, doctype: Option(DocType), ) -> Result(#(Element, String), Nil) { try_parsers( [parse_empty_elem(_, doctype), parse_tagged_elem(_, doctype)], doc, ) } fn parse_empty_elem( doc: String, doctype: Option(DocType), ) -> Result(#(Element, String), Nil) { case doc { "<" <> tail -> { use #(name, doc) <- result.try(parse_name(tail)) use #(attrs, doc) <- result.try(parse_attributes(doc, doctype, [])) let doc = trim_space(doc) case doc { "/>" <> tail -> Ok(#(EmptyElem(name, attrs), tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_tagged_elem( doc: String, doctype: Option(DocType), ) -> Result(#(Element, String), Nil) { case doc { "<" <> tail -> { use #(name, doc) <- result.try(parse_name(tail)) use #(attrs, doc) <- result.try(parse_attributes(doc, doctype, [])) let doc = trim_space(doc) case doc { ">" <> tail -> { use #(content, doc) <- result.try(parse_content(tail, doctype, [])) case doc { " tail -> { use #(close_name, doc) <- result.try(parse_name(tail)) let doc = trim_space(doc) case doc { ">" <> tail -> { use <- bool.guard( when: name != close_name, return: Error(Nil), ) Ok(#(Element(name, attrs, content), tail)) } _ -> Error(Nil) } } _ -> Error(Nil) } } _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_content( doc: String, doctype: Option(DocType), content: List(Element), ) -> Result(#(List(Element), String), Nil) { use #(chardata, doc) <- result.try(parse_chardata(doc, doctype, "")) let new_content = case chardata { "" -> content _ -> [Text(chardata), ..content] } case doc { "<" <> _ -> { case try_parsers( [ parse_element(_, doctype), parse_comment, parse_cdata, parse_pi, ], doc, ) { Ok(#(element, doc)) -> parse_content(doc, doctype, [element, ..content]) Error(_) -> Ok(#(list.reverse(new_content), doc)) } } _ -> Ok(#(list.reverse(new_content), doc)) } } fn parse_pi(doc: String) -> Result(#(Element, String), Nil) { case doc { " tail -> { use #(name, doc) <- result.try(parse_name(tail)) use <- bool.guard( when: string.lowercase(name) == "xml", return: Error(Nil), ) let doc = trim_space(doc) use #(content, doc) <- result.try(parse_pi_content(doc, "")) case doc { "?>" <> tail -> Ok(#(PI(name, content), tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_pi_content(doc: String, pi: String) -> Result(#(String, String), Nil) { case doc { "?>" <> _ -> Ok(#(pi, doc)) "" -> Error(Nil) _ -> { case parse_char(doc) { Ok(#(char, doc)) -> parse_pi_content(doc, pi <> char) Error(_) -> Ok(#(pi, doc)) } } } } fn parse_cdata(doc: String) -> Result(#(Element, String), Nil) { case doc { " tail -> { use #(cdata, doc) <- result.try(do_parse_cdata(tail, "")) Ok(#(CData(cdata), doc)) } _ -> Error(Nil) } } fn do_parse_cdata(doc: String, cdata: String) -> Result(#(String, String), Nil) { case doc { "]]>" <> tail -> Ok(#(cdata, tail)) "" -> Error(Nil) _ -> { case parse_char(doc) { Ok(#(char, doc)) -> do_parse_cdata(doc, cdata <> char) _ -> Error(Nil) } } } } fn parse_chardata( doc: String, doctype: Option(DocType), chardata: String, ) -> Result(#(String, String), Nil) { case doc { "]]>" <> _ -> Error(Nil) "<" <> _ -> Ok(#(chardata, doc)) "&" <> _ -> { use #(refval, doc) <- result.try(parse_reference(doc, doctype)) parse_chardata(doc, doctype, chardata <> refval) } "" -> Ok(#("", "")) _ -> { let assert Ok(#(char, tail)) = string.pop_grapheme(doc) parse_chardata(tail, doctype, chardata <> char) } } } fn parse_attributes( doc: String, doctype: Option(DocType), attrs: List(Attribute), ) -> Result(#(List(Attribute), String), Nil) { case parse_attribute(doc, doctype) { Ok(#(attr, doc)) -> parse_attributes(doc, doctype, [attr, ..attrs]) Error(_) -> Ok(#(list.reverse(attrs), doc)) } } fn parse_attribute( doc: String, doctype: Option(DocType), ) -> Result(#(Attribute, String), Nil) { let doc = trim_space(doc) use #(name, doc) <- result.try(parse_name(doc)) case doc { "=" <> tail -> { case tail { "\"" <> tail -> { let #(value, doc) = parse_multiple_optional( tail, try_parsers( [ fn(doc) { case string.pop_grapheme(doc) { Ok(#(char, _doc)) if char == "<" || char == "&" || char == "\"" -> Error(Nil) Ok(#(char, doc)) -> Ok(#(char, doc)) Error(_) -> Error(Nil) } }, parse_reference(_, doctype), ], _, ), "", ) case doc { "\"" <> tail -> Ok(#(Attribute(name, value), tail)) _ -> Error(Nil) } } "'" <> tail -> { let #(value, doc) = parse_multiple_optional( tail, try_parsers( [ fn(doc) { case string.pop_grapheme(doc) { Ok(#(char, _doc)) if char == "<" || char == "&" || char == "'" -> Error(Nil) Ok(#(char, doc)) -> Ok(#(char, doc)) Error(_) -> Error(Nil) } }, parse_reference(_, doctype), ], _, ), "", ) case doc { "'" <> tail -> Ok(#(Attribute(name, value), tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_reference( doc: String, doctype: Option(DocType), ) -> Result(#(String, String), Nil) { case doc { "&#" <> tail -> { case tail { "x" <> tail -> { use #(digits, doc) <- result.try(parse_multiple(tail, parse_hex_digit)) case doc { ";" <> tail -> { use value <- result.try(int.base_parse(digits, 16)) use codepoint <- result.try(string.utf_codepoint(value)) Ok(#(string.from_utf_codepoints([codepoint]), tail)) } _ -> Error(Nil) } } _ -> { use #(digits, doc) <- result.try(parse_multiple(tail, parse_digit)) case doc { ";" <> tail -> { use value <- result.try(int.base_parse(digits, 10)) use codepoint <- result.try(string.utf_codepoint(value)) Ok(#(string.from_utf_codepoints([codepoint]), tail)) } _ -> Error(Nil) } } } } "&" <> tail -> { use #(name, doc) <- result.try(parse_name(tail)) case doc { ";" <> tail -> { use value <- result.try(process_reference(name, doctype)) Ok(#("", value <> tail)) } _ -> Error(Nil) } } _ -> Error(Nil) } } fn process_reference( ref: String, doctype: Option(DocType), ) -> Result(String, Nil) { case doctype { Some(DocType(_, entities)) -> { get_reference(entities, ref) } None -> { get_reference(default_entities(), ref) } } } fn get_reference( entities: dict.Dict(String, Entity), ref: String, ) -> Result(String, Nil) { case dict.get(entities, ref) { Ok(InternalEntity(val)) -> Ok(val) Ok(PublicExternalEntity(_, _)) | Ok(SystemExternalEntity(_)) -> Error(Nil) Error(_) -> { case entities == default_entities() { True -> Error(Nil) False -> get_reference(default_entities(), ref) } } } } fn parse_name(doc: String) -> Result(#(String, String), Nil) { case parse_name_start_char(doc) { Ok(#(char, tail)) -> { do_parse_name(tail, char) } Error(_) -> Error(Nil) } } fn do_parse_name(doc: String, name: String) -> Result(#(String, String), Nil) { case parse_name_char(doc) { Ok(#(char, tail)) -> do_parse_name(tail, name <> char) Error(_) -> Ok(#(name, doc)) } } fn parse_prolog( doc: String, ) -> Result(#(Declaration, Option(DocType), String), Nil) { let #(decl, doc) = case parse_decl(doc) { Ok(#(decl, doc)) -> #(decl, doc) _ -> #(Declaration("1.0", "UTF-8", False), doc) } let doc = parse_misc(doc) Ok(#(decl, None, doc)) } fn parse_misc(doc: String) -> String { case try_parsers( [ parse_comment, fn(doc) { parse_space(doc) |> result.map(fn(sp) { #(Whitespace, sp.1) }) }, parse_pi, ], doc, ) { Ok(#(_element, doc)) -> parse_misc(doc) Error(Nil) -> doc } } fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) { case doc { " tail -> { use #(versioninfo, doc) <- result.try(parse_versioninfo(tail)) let #(encoding, doc) = case parse_encodingdecl(doc) { Ok(e) -> e Error(_) -> #("", doc) } let #(standalone, doc) = case parse_standalone(doc) { Ok(e) -> e Error(_) -> #(False, doc) } case trim_space(doc) { "?>" <> tail -> Ok(#(Declaration(versioninfo:, encoding:, standalone:), tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_versioninfo(doc: String) -> Result(#(String, String), Nil) { case trim_space(doc) { "version=" <> tail -> { use #(version, doc) <- result.try(parse_version(tail)) Ok(#(version, doc)) } _ -> Error(Nil) } } fn parse_version(doc: String) -> Result(#(String, String), Nil) { case doc { "\"1." <> tail -> { use #(version, doc) <- result.try(do_parse_version(tail, "1.")) case doc { "\"" <> tail -> Ok(#(version, tail)) _ -> Error(Nil) } } "'1." <> tail -> { use #(version, doc) <- result.try(do_parse_version(tail, "1.")) case doc { "'" <> tail -> Ok(#(version, tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } fn do_parse_version( doc: String, version: String, ) -> Result(#(String, String), Nil) { case parse_digit(doc) { Ok(#(digit, doc)) -> do_parse_version(doc, version <> digit) Error(_) if version == "" -> Error(Nil) Error(_) -> Ok(#(version, doc)) } } fn parse_encodingdecl(doc: String) -> Result(#(String, String), Nil) { case trim_space(doc) { "encoding=" <> tail -> { case tail { "\"" <> tail -> { use #(encoding, doc) <- result.try(parse_encoding(tail)) case doc { "\"" <> tail -> Ok(#(encoding, tail)) _ -> Error(Nil) } } "'" <> tail -> { use #(encoding, doc) <- result.try(parse_encoding(tail)) case doc { "'" <> tail -> Ok(#(encoding, tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } _ -> Error(Nil) } } fn parse_encoding(doc: String) -> Result(#(String, String), Nil) { case parse_alpha(doc) { Ok(#(char, doc)) -> { Ok(parse_multiple_optional( doc, try_parsers( [ parse_alpha, parse_digit, fn(doc) { case doc { "." as char <> tail | "_" as char <> tail | "-" as char <> tail -> Ok(#(char, tail)) _ -> Error(Nil) } }, ], _, ), char, )) } Error(_) -> Error(Nil) } } fn parse_standalone(doc: String) -> Result(#(Bool, String), Nil) { case trim_space(doc) { "standalone=\"yes\"" <> tail | "standalone='yes'" <> tail -> Ok(#(True, tail)) "standalone=\"no\"" <> tail | "standalone='no'" <> tail -> Ok(#(True, tail)) _ -> Error(Nil) } } fn parse_digit(doc: String) -> Result(#(String, String), Nil) { case doc { "0" as digit <> tail | "1" as digit <> tail | "2" as digit <> tail | "3" as digit <> tail | "4" as digit <> tail | "5" as digit <> tail | "6" as digit <> tail | "7" as digit <> tail | "8" as digit <> tail | "9" as digit <> tail -> Ok(#(digit, tail)) _ -> Error(Nil) } } pub fn parse_hex_digit(str: String) -> Result(#(String, String), Nil) { case str { "0" as digit <> tail | "1" as digit <> tail | "2" as digit <> tail | "3" as digit <> tail | "4" as digit <> tail | "5" as digit <> tail | "6" as digit <> tail | "7" as digit <> tail | "8" as digit <> tail | "9" as digit <> tail | "a" as digit <> tail | "b" as digit <> tail | "c" as digit <> tail | "d" as digit <> tail | "e" as digit <> tail | "f" as digit <> tail | "A" as digit <> tail | "B" as digit <> tail | "C" as digit <> tail | "D" as digit <> tail | "E" as digit <> tail | "F" as digit <> tail -> Ok(#(digit, tail)) _ -> Error(Nil) } } fn parse_alpha(doc: String) -> Result(#(String, String), Nil) { case doc { "a" as char <> tail | "b" as char <> tail | "c" as char <> tail | "d" as char <> tail | "e" as char <> tail | "f" as char <> tail | "g" as char <> tail | "h" as char <> tail | "i" as char <> tail | "j" as char <> tail | "k" as char <> tail | "l" as char <> tail | "m" as char <> tail | "n" as char <> tail | "o" as char <> tail | "p" as char <> tail | "q" as char <> tail | "r" as char <> tail | "s" as char <> tail | "t" as char <> tail | "u" as char <> tail | "v" as char <> tail | "w" as char <> tail | "x" as char <> tail | "y" as char <> tail | "z" as char <> tail | "A" as char <> tail | "B" as char <> tail | "C" as char <> tail | "D" as char <> tail | "E" as char <> tail | "F" as char <> tail | "G" as char <> tail | "H" as char <> tail | "I" as char <> tail | "J" as char <> tail | "K" as char <> tail | "L" as char <> tail | "M" as char <> tail | "N" as char <> tail | "O" as char <> tail | "P" as char <> tail | "Q" as char <> tail | "R" as char <> tail | "S" as char <> tail | "T" as char <> tail | "U" as char <> tail | "V" as char <> tail | "W" as char <> tail | "X" as char <> tail | "Y" as char <> tail | "Z" as char <> tail -> Ok(#(char, tail)) _ -> Error(Nil) } } fn parse_comment(doc: String) -> Result(#(Element, String), Nil) { case doc { "" <> tail -> Ok(#(Comment(comment), tail)) _ -> Error(Nil) } } _ -> Error(Nil) } } fn do_parse_comment(doc: String) -> #(String, String) { parse_multiple_optional( doc, try_parsers( [ parse_char_except_dash, fn(doc) { case doc { "-" <> tail -> { use #(char, doc) <- result.try(parse_char_except_dash(tail)) Ok(#("-" <> char, doc)) } _ -> Error(Nil) } }, ], _, ), "", ) } fn parse_char_except_dash(doc: String) -> Result(#(String, String), Nil) { case doc { "-" <> _ -> Error(Nil) _ -> parse_char(doc) } } fn parse_char(doc: String) -> Result(#(String, String), Nil) { case string.pop_grapheme(doc) { Ok(#("\r\n", tail)) -> Ok(#("\r\n", tail)) Ok(#("\t", tail)) -> Ok(#("\t", tail)) Ok(#("\n", tail)) -> Ok(#("\n", tail)) Ok(#("\r", tail)) -> Ok(#("\r", tail)) Ok(#(char, tail)) -> { let assert [codepoint] = string.to_utf_codepoints(char) case string.utf_codepoint_to_int(codepoint) { i if i >= 0x20 && i <= 0xD7FF -> Ok(#(char, tail)) i if i >= 0xE000 && i <= 0xFFFD -> Ok(#(char, tail)) i if i >= 0x10000 && i <= 0x10FFFF -> Ok(#(char, tail)) _ -> Error(Nil) } } Error(_) -> Error(Nil) } } fn parse_name_start_char(doc: String) -> Result(#(String, String), Nil) { case string.pop_grapheme(doc) { Ok(#(":", tail)) -> Ok(#(":", tail)) Ok(#("_", tail)) -> Ok(#("_", tail)) Ok(#(char, tail)) -> { let assert [codepoint] = string.to_utf_codepoints(char) case string.utf_codepoint_to_int(codepoint) { i if i >= 0x41 && i <= 0x5A -> Ok(#(char, tail)) i if i >= 0x61 && i <= 0x7A -> Ok(#(char, tail)) i if i >= 0xC0 && i <= 0xD6 -> Ok(#(char, tail)) i if i >= 0xD8 && i <= 0xF6 -> Ok(#(char, tail)) i if i >= 0xF8 && i <= 0x2FF -> Ok(#(char, tail)) i if i >= 0x370 && i <= 0x37D -> Ok(#(char, tail)) i if i >= 0x37F && i <= 0x1FFF -> Ok(#(char, tail)) i if i >= 0x200C && i <= 0x200D -> Ok(#(char, tail)) i if i >= 0x2070 && i <= 0x218F -> Ok(#(char, tail)) i if i >= 0x2C00 && i <= 0x2FEF -> Ok(#(char, tail)) i if i >= 0x3000 && i <= 0xD7FF -> Ok(#(char, tail)) i if i >= 0xF900 && i <= 0xFDCF -> Ok(#(char, tail)) i if i >= 0xFDF0 && i <= 0xFFFD -> Ok(#(char, tail)) i if i >= 0x10000 && i <= 0xEFFFF -> Ok(#(char, tail)) _ -> Error(Nil) } } Error(_) -> Error(Nil) } } fn parse_name_char(doc: String) -> Result(#(String, String), Nil) { case string.pop_grapheme(doc) { Ok(#(":", tail)) -> Ok(#(":", tail)) Ok(#("_", tail)) -> Ok(#("_", tail)) Ok(#("-", tail)) -> Ok(#("-", tail)) Ok(#(".", tail)) -> Ok(#(".", tail)) Ok(#(char, tail)) -> { let assert [codepoint] = string.to_utf_codepoints(char) case string.utf_codepoint_to_int(codepoint) { i if i >= 0x30 && i <= 0x39 -> Ok(#(char, tail)) i if i == 0xB7 -> Ok(#(char, tail)) i if i >= 0x41 && i <= 0x5A -> Ok(#(char, tail)) i if i >= 0x61 && i <= 0x7A -> Ok(#(char, tail)) i if i >= 0xC0 && i <= 0xD6 -> Ok(#(char, tail)) i if i >= 0xD8 && i <= 0xF6 -> Ok(#(char, tail)) i if i >= 0xF8 && i <= 0x37D -> Ok(#(char, tail)) i if i >= 0x37F && i <= 0x1FFF -> Ok(#(char, tail)) i if i >= 0x200C && i <= 0x200D -> Ok(#(char, tail)) i if i >= 0x203F && i <= 0x2040 -> Ok(#(char, tail)) i if i >= 0x2070 && i <= 0x218F -> Ok(#(char, tail)) i if i >= 0x2C00 && i <= 0x2FEF -> Ok(#(char, tail)) i if i >= 0x3000 && i <= 0xD7FF -> Ok(#(char, tail)) i if i >= 0xF900 && i <= 0xFDCF -> Ok(#(char, tail)) i if i >= 0xFDF0 && i <= 0xFFFD -> Ok(#(char, tail)) i if i >= 0x10000 && i <= 0xEFFFF -> Ok(#(char, tail)) _ -> Error(Nil) } } Error(_) -> Error(Nil) } } fn trim_space(doc: String) -> String { case parse_space(doc) { Ok(#(_, doc)) -> trim_space(doc) Error(_) -> doc } } fn parse_space(doc: String) -> Result(#(String, String), Nil) { case doc { " " as ws <> tail | "\t" as ws <> tail | "\n" as ws <> tail | "\r" as ws <> tail -> Ok(#(ws, tail)) _ -> Error(Nil) } } fn try_parsers( over list: List(fn(String) -> Result(#(a, String), Nil)), against static_data: String, ) -> Result(#(a, String), Nil) { case list { [] -> Error(Nil) [first, ..rest] -> case first(static_data) { Error(_) -> try_parsers(rest, static_data) Ok(r) -> Ok(r) } } } pub fn parse_multiple( to_parse str: String, with to_run: fn(String) -> Result(#(String, String), Nil), ) -> Result(#(String, String), Nil) { case parse_multiple_optional(str, to_run, "") { #("", _) -> Error(Nil) #(r, rest) -> Ok(#(r, rest)) } } fn parse_multiple_optional( to_parse str: String, with to_run: fn(String) -> Result(#(String, String), Nil), acc ret: String, ) -> #(String, String) { case str { "" -> #(ret, str) _ -> case to_run(str) { Ok(#(r, rest)) -> parse_multiple_optional(rest, to_run, ret <> r) Error(_) -> #(ret, str) } } }