Patch notes (free-form text ahead of the first "diff --git" line; git apply and
patch skip it).

Scope: src/glxml.gleam only.
  * Declaration: the single Declaration constructor is replaced by XMLDecl (same
    three fields) and five constructors are added: GEntityDecl,
    PEntityDecl(name, decl), ElementDecl, AttListDecl, NotationDecl.
  * New ExternalID type (SystemID / PublicID). DocType gains an external_id
    field of type Option(ExternalID) between name and entities, and the DocType
    pattern in process_reference is widened to three fields to match.
  * parse_prolog now tries parse_doctype after parse_misc and falls back to
    None when it returns an error.
  * New parsers: parse_doctype, parse_int_subset, do_parse_int_subset,
    get_entity_replacement, parse_external_id, parse_public_literal,
    parse_system_literal.
  * parse_decl: the two fallback `case` expressions become result.unwrap calls.

Review notes on the added code (behaviour left exactly as submitted):
  * parse_doctype binds int_subset but never uses it; the DocType it returns
    always carries dict.new() as its entities.
  * parse_doctype matches ">" directly on what parse_int_subset returns, with no
    trim_space in between, so "]" followed by whitespace and ">" is rejected.
  * do_parse_int_subset evaluates `todo` for any input that, after trim_space,
    does not start with "%" -- including the closing "]".
  * do_parse_int_subset is first called with [] and passes decl_list along
    unchanged, so get_entity_replacement finds nothing and every "%name;"
    reference currently ends in Error(Nil).
  * parse_external_id does not require whitespace after the SYSTEM / PUBLIC
    keyword (trim_space also accepts none) -- TODO confirm this is intended.

diff --git a/src/glxml.gleam b/src/glxml.gleam
index 5ff225c..dbed3b1 100644
--- a/src/glxml.gleam
+++ b/src/glxml.gleam
@@ -7,7 +7,12 @@ import gleam/result
 import gleam/string
 
 pub type Declaration {
-  Declaration(versioninfo: String, encoding: String, standalone: Bool)
+  XMLDecl(versioninfo: String, encoding: String, standalone: Bool)
+  GEntityDecl
+  PEntityDecl(name: String, decl: String)
+  ElementDecl
+  AttListDecl
+  NotationDecl
 }
 
 pub type Entity {
@@ -16,8 +21,17 @@ pub type Entity {
   PublicExternalEntity(literal: String, pubidliteral: String)
 }
 
+pub type ExternalID {
+  SystemID(system_literal: String)
+  PublicID(system_literal: String, public_literal: String)
+}
+
 pub type DocType {
-  DocType(name: String, entities: dict.Dict(String, Entity))
+  DocType(
+    name: String,
+    external_id: Option(ExternalID),
+    entities: dict.Dict(String, Entity),
+  )
 }
 
 pub type Document {
@@ -387,7 +401,7 @@ fn process_reference(
   doctype: Option(DocType),
 ) -> Result(String, Nil) {
   case doctype {
-    Some(DocType(_, entities)) -> {
+    Some(DocType(_, _, entities)) -> {
       get_reference(entities, ref)
     }
     None -> {
@@ -431,13 +445,249 @@ fn do_parse_name(doc: String, name: String) -> Result(#(String, String), Nil) {
 fn parse_prolog(
   doc: String,
 ) -> Result(#(Declaration, Option(DocType), String), Nil) {
-  let #(decl, doc) = case parse_decl(doc) {
-    Ok(#(decl, doc)) -> #(decl, doc)
-    _ -> #(Declaration("1.0", "UTF-8", False), doc)
-  }
+  let #(decl, doc) =
+    parse_decl(doc) |> result.unwrap(#(XMLDecl("1.0", "UTF-8", False), doc))
+
   let doc = parse_misc(doc)
 
-  Ok(#(decl, None, doc))
+  let #(doctype, doc) =
+    parse_doctype(doc)
+    |> result.map(fn(d) { #(Some(d.0), d.1) })
+    |> result.unwrap(#(None, doc))
+
+  Ok(#(decl, doctype, doc))
+}
+
+fn parse_doctype(doc: String) -> Result(#(DocType, String), Nil) {
+  case doc {
+    "<!DOCTYPE" <> tail -> {
+      let doc = trim_space(tail)
+      use #(name, doc) <- result.try(parse_name(doc))
+
+      let #(external_id, doc) =
+        parse_external_id(doc) |> result.unwrap(#(None, doc))
+
+      let doc = trim_space(doc)
+
+      let #(int_subset, doc) =
+        parse_int_subset(doc) |> result.unwrap(#([], doc))
+
+      case doc {
+        ">" <> tail -> Ok(#(DocType(name, external_id, dict.new()), tail))
+        _ -> Error(Nil)
+      }
+    }
+    _ -> Error(Nil)
+  }
+}
+
+fn parse_int_subset(doc: String) -> Result(#(List(Declaration), String), Nil) {
+  let doc = trim_space(doc)
+  case doc {
+    "[" <> tail -> {
+      use #(decl_list, doc) <- result.try(do_parse_int_subset(tail, []))
+      case doc {
+        "]" <> tail -> {
+          Ok(#(decl_list, tail))
+        }
+        _ -> Error(Nil)
+      }
+    }
+    _ -> Error(Nil)
+  }
+}
+
+fn do_parse_int_subset(
+  doc: String,
+  decl_list: List(Declaration),
+) -> Result(#(List(Declaration), String), Nil) {
+  let doc = trim_space(doc)
+  case doc {
+    "%" <> tail -> {
+      use #(name, doc) <- result.try(parse_name(tail))
+      case doc {
+        ";" <> tail -> {
+          case get_entity_replacement(name, decl_list) {
+            Some(decl) -> {
+              do_parse_int_subset(decl <> tail, decl_list)
+            }
+            _ -> Error(Nil)
+          }
+        }
+        _ -> Error(Nil)
+      }
+    }
+    _ -> {
+      todo
+    }
+  }
+}
+
+fn get_entity_replacement(
+  entity: String,
+  decl_list: List(Declaration),
+) -> Option(String) {
+  list.find_map(decl_list, fn(decl) {
+    case decl {
+      PEntityDecl(name, decl) if name == entity -> Ok(Some(decl))
+      _ -> Error(Nil)
+    }
+  })
+  |> result.unwrap(None)
+}
+
+fn parse_external_id(doc: String) -> Result(#(Option(ExternalID), String), Nil) {
+  let doc = trim_space(doc)
+  case doc {
+    "SYSTEM" <> tail -> {
+      let doc = trim_space(tail)
+      use #(system_literal, doc) <- result.try(parse_system_literal(
+        doc,
+        None,
+        "",
+      ))
+      Ok(#(Some(SystemID(system_literal:)), doc))
+    }
+    "PUBLIC" <> tail -> {
+      let doc = trim_space(tail)
+      use #(public_literal, doc) <- result.try(parse_public_literal(
+        doc,
+        None,
+        "",
+      ))
+      let doc = trim_space(doc)
+      use #(system_literal, doc) <- result.try(parse_system_literal(
+        doc,
+        None,
+        "",
+      ))
+      Ok(#(Some(PublicID(system_literal:, public_literal:)), doc))
+    }
+    _ -> Error(Nil)
+  }
+}
+
+fn parse_public_literal(
+  doc: String,
+  quote: Option(String),
+  literal: String,
+) -> Result(#(String, String), Nil) {
+  case doc, quote {
+    "\"" as q <> tail, None | "'" as q <> tail, None ->
+      parse_public_literal(tail, Some(q), "")
+    "", _ -> Error(Nil)
+    _, None -> Error(Nil)
+    "\"" <> tail, Some("\"") -> Ok(#(literal, tail))
+    "'" <> tail, Some("'") -> Ok(#(literal, tail))
+    " " as char <> tail, Some(_)
+    | "\r" as char <> tail, Some(_)
+    | "\n" as char <> tail, Some(_)
+    | "0" as char <> tail, Some(_)
+    | "1" as char <> tail, Some(_)
+    | "2" as char <> tail, Some(_)
+    | "3" as char <> tail, Some(_)
+    | "4" as char <> tail, Some(_)
+    | "5" as char <> tail, Some(_)
+    | "6" as char <> tail, Some(_)
+    | "7" as char <> tail, Some(_)
+    | "8" as char <> tail, Some(_)
+    | "9" as char <> tail, Some(_)
+    | "a" as char <> tail, Some(_)
+    | "b" as char <> tail, Some(_)
+    | "c" as char <> tail, Some(_)
+    | "d" as char <> tail, Some(_)
+    | "e" as char <> tail, Some(_)
+    | "f" as char <> tail, Some(_)
+    | "g" as char <> tail, Some(_)
+    | "h" as char <> tail, Some(_)
+    | "i" as char <> tail, Some(_)
+    | "j" as char <> tail, Some(_)
+    | "k" as char <> tail, Some(_)
+    | "l" as char <> tail, Some(_)
+    | "m" as char <> tail, Some(_)
+    | "n" as char <> tail, Some(_)
+    | "o" as char <> tail, Some(_)
+    | "p" as char <> tail, Some(_)
+    | "q" as char <> tail, Some(_)
+    | "r" as char <> tail, Some(_)
+    | "s" as char <> tail, Some(_)
+    | "t" as char <> tail, Some(_)
+    | "u" as char <> tail, Some(_)
+    | "v" as char <> tail, Some(_)
+    | "w" as char <> tail, Some(_)
+    | "x" as char <> tail, Some(_)
+    | "y" as char <> tail, Some(_)
+    | "z" as char <> tail, Some(_)
+    | "A" as char <> tail, Some(_)
+    | "B" as char <> tail, Some(_)
+    | "C" as char <> tail, Some(_)
+    | "D" as char <> tail, Some(_)
+    | "E" as char <> tail, Some(_)
+    | "F" as char <> tail, Some(_)
+    | "G" as char <> tail, Some(_)
+    | "H" as char <> tail, Some(_)
+    | "I" as char <> tail, Some(_)
+    | "J" as char <> tail, Some(_)
+    | "K" as char <> tail, Some(_)
+    | "L" as char <> tail, Some(_)
+    | "M" as char <> tail, Some(_)
+    | "N" as char <> tail, Some(_)
+    | "O" as char <> tail, Some(_)
+    | "P" as char <> tail, Some(_)
+    | "Q" as char <> tail, Some(_)
+    | "R" as char <> tail, Some(_)
+    | "S" as char <> tail, Some(_)
+    | "T" as char <> tail, Some(_)
+    | "U" as char <> tail, Some(_)
+    | "V" as char <> tail, Some(_)
+    | "W" as char <> tail, Some(_)
+    | "X" as char <> tail, Some(_)
+    | "Y" as char <> tail, Some(_)
+    | "Z" as char <> tail, Some(_)
+    | "-" as char <> tail, Some(_)
+    | "(" as char <> tail, Some(_)
+    | ")" as char <> tail, Some(_)
+    | "+" as char <> tail, Some(_)
+    | "," as char <> tail, Some(_)
+    | "." as char <> tail, Some(_)
+    | "/" as char <> tail, Some(_)
+    | ":" as char <> tail, Some(_)
+    | "=" as char <> tail, Some(_)
+    | "?" as char <> tail, Some(_)
+    | ";" as char <> tail, Some(_)
+    | "!" as char <> tail, Some(_)
+    | "*" as char <> tail, Some(_)
+    | "#" as char <> tail, Some(_)
+    | "@" as char <> tail, Some(_)
+    | "$" as char <> tail, Some(_)
+    | "_" as char <> tail, Some(_)
+    | "%" as char <> tail, Some(_)
+    | "'" as char <> tail, Some("\"")
+    -> {
+      parse_public_literal(tail, quote, literal <> char)
+    }
+    _, _ -> Error(Nil)
+  }
+}
+
+fn parse_system_literal(
+  doc: String,
+  quote: Option(String),
+  literal: String,
+) -> Result(#(String, String), Nil) {
+  case doc, quote {
+    "\"" as q <> tail, None | "'" as q <> tail, None ->
+      parse_system_literal(tail, Some(q), "")
+    "", _ -> Error(Nil)
+    _, None -> Error(Nil)
+    "\"" <> tail, Some("\"") -> Ok(#(literal, tail))
+    "'" <> tail, Some("'") -> Ok(#(literal, tail))
+    _, _ -> {
+      let assert Ok(#(char, tail)) = string.pop_grapheme(doc)
+
+      parse_system_literal(tail, quote, literal <> char)
+    }
+  }
 }
 
 fn parse_misc(doc: String) -> String {
@@ -463,18 +713,14 @@ fn parse_decl(doc: String) -> Result(#(Declaration, String), Nil) {
   case doc {
     "<?xml" <> tail -> {
       use #(versioninfo, doc) <- result.try(parse_versioninfo(tail))
-      let #(encoding, doc) = case parse_encodingdecl(doc) {
-        Ok(e) -> e
-        Error(_) -> #("", doc)
-      }
-      let #(standalone, doc) = case parse_standalone(doc) {
-        Ok(e) -> e
-        Error(_) -> #(False, doc)
-      }
+      let #(encoding, doc) =
+        parse_encodingdecl(doc) |> result.unwrap(#("", doc))
+      let #(standalone, doc) =
+        parse_standalone(doc) |> result.unwrap(#(False, doc))
 
       case trim_space(doc) {
         "?>" <> tail ->
-          Ok(#(Declaration(versioninfo:, encoding:, standalone:), tail))
+          Ok(#(XMLDecl(versioninfo:, encoding:, standalone:), tail))
         _ -> Error(Nil)
       }
     }