123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314(*
Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015 Anton Lavrik
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*)(* XML format parsing and generation using Xmlm library:
*
* http://erratique.ch/software/xmlm
*
* Details about how Xmlm parses XML are available here:
*
* http://erratique.ch/software/xmlm/doc/Xmlm
*)moduleC=Piqi_commonopenC.Stdtypexml=Piqi_xml_type.xmltypexml_elem=Piqi_xml_type.xml_elemtypexml_parser={input:Xmlm.input;fname:string;(* name of the file *)}letinit_xml_parser?(fname="input")source:xml_parser=(* don't strip whitespace in CDATA and expect UTF-8 input (no other encodings
* are supported by Piqi)
*
* NOTE: according to Xmlm documentation, even when we specify ~strip:false,
* "all kinds of line ends are translated to the newline character (U+000A)"
*
* NOTE: we use a custom whitespace stripper below that doesn't strip leading
* and trailing whitespace in text nodes.
*)letinput=Xmlm.make_inputsource~enc:(Some`UTF_8)~strip:falsein{input=input;fname=fname;}letinit_from_channel?fnamech=letsource=`Channelchininit_xml_parsersource?fnameletinit_from_string?fnames=letsource=`String(0,s)ininit_xml_parsersource?fname(* XML input *)(* custom whitespace stripper, that srips only formatting whitespace and leaves
* text nodes untouched *)letstrip_whitespace(l:xmllist)=matchlwith|[(`Data_)]->l|_->(* there is at least one element in the list; stripping all the data around
* and between the elements *)List.filter(function`Elem_->true|`Data_->false)lletdo_read_xml_objxml_parser:xml=letmake_loc(line,col)=(xml_parser.fname,line,col)in(* below are cusomized versions of Xmlm.input_tree and Xmlm.input_doc_tree
* functions that capture accurate information about location of elements and
* data in the input stream *)letinput_tree~el~datai=letrecauxtagscontext=letpos=Xmlm.posiinmatchXmlm.inputiwith|`El_starttag->aux((pos,tag)::tags)([]::context)|`El_end->beginmatchtags,contextwith|(pos,tag)::tags',childs::context'->letel=elpostag(List.revchilds)inbeginmatchcontext'with|parent::context''->auxtags'((el::parent)::context'')|[]->elend|_->assertfalseend|`Datad->beginmatchcontextwith|childs::context'->auxtags(((dataposd)::childs)::context')|[]->assertfalseend|`Dtd_->assertfalseinaux[][]inletinput_doc_tree~el~datai=letpos=Xmlm.posiinmatchXmlm.inputiwith|`Dtdd->d,input_tree~el~datai|_->C.error_at(make_locpos)"invalid XML header"inletelpostagcontents=let(ns,name),attr=taginletcontents=strip_whitespacecontentsinletloc=make_locposin(* check that there is no namespace and no attributes *)ifns<>""thenC.error_atloc"namespaces are not allowed in XML element names";ifattr<>[]thenC.error_atloc"attributes are not allowed in XML elements";letxml_elem=(name,contents)inletres=`Elemxml_elemin(* add information about term locations to the location database *)Piqloc.addloclocname;Piqloc.addloclocxml_elem;Piqloc.addloclocres;resinletdataposd=letres=`Datadin(* add information about term locations to the location database *)letloc=make_locposinPiqloc.addloclocd;Piqloc.addloclocres;resintrylet_dtd,xml=input_doc_tree~el~dataxml_parser.inputinxmlwithXmlm.Error(pos,err)->letloc=make_locposinleterrstr=Xmlm.error_messageerrinC.error_atlocerrstrletread_xml_obj(xml_parser:xml_parser):xmloption=letis_eoi=tryXmlm.eoixml_parser.inputwith|Xmlm.Error(_pos,`Unexpected_eoi)->(* raised on a completely empty input *)true|Xmlm.Error((line,col),err)->letloc=xml_parser.fname,line,colinleterrstr=Xmlm.error_messageerrinC.error_atlocerrstrinifis_eoithenNoneelseletxml=do_read_xml_objxml_parserinSomexml(* XML output
*
* We use 2-space indentation and output a newline character after the root
* element.
*
* We do not use Xmlm's library indentation mode (although it would've been so
* convenient), because it inserts indentation around text nodes which leads to
* extra whitespace. This means we will get extra whitespace when we read
* serialized values of primitive types back. And this is not what we want for
* data serialization purposes.
*
* For this reason, we do indentation ourselves using `Data elements filled with
* newlines and whitespace. This way we generate indented XML and still can read
* it back reliably. Other XML serializers may behave differently, but we don't
* really care as we can set our own rules in this case.
*)(* helpers *)letws=`Data" "(* 2 spaces indentation *)letnl=`Data"\n"(* newline *)(* build a list of [ ws; ... ws ] :: l where the number of ws nodes is
* determined by the depth parameter *)letindent_listdepthl=letrecauxdepthaccu=ifdepth=0thenaccuelseaux(depth-1)(ws::accu)inauxdepthl(* rewrite xml tree to inject indentation represented as `Data elements
* containing either whitespace or newlines *)letindent_tree(xml:xml):xml=letrecauxdepthnode=matchnodewith|`Data_(* don't indent data elements *)|`Elem(_,[`Data_])|`Elem(_,[])->node(* nothing to indent inside empty element *)|`Elem(name,contents)->(* non-empty non-data contents *)letl=List.fold_left(funaccux->letx=aux(depth+1)xin(* indent the sub-tree *)indent_list(depth+1)(x::nl::accu))(indent_listdepth[])(* indent right before the closing tag *)(List.revcontents)inletcontents=nl::lin(* newline after the opening tag *)`Elem(name,contents)inaux0xmlletgen_xml?(pretty_print=true)?(nl=false)?(decl=true)dest(xml:xml)=letfrag=function(* xml to Xmlm.frag converter *)|`Datax->`Datax|`Elem(name,contents)->lettag=("",name),[]in(* no namespace, no attributes *)`El(tag,contents)inletxml=ifpretty_printthenindent_treexmlelsexmlinletoutput=Xmlm.make_outputdest~nl~declinletdtd=NoneinXmlm.output_doc_treefragoutput(dtd,xml)letxml_to_buffer?pretty_print?declbufxml=letdest=`Bufferbufingen_xmldestxml?pretty_print?declletxml_to_channel?pretty_printchxml=letdest=`Channelchin(* output a newline character after the root element so that the file ends
* with a newline *)gen_xmldestxml?pretty_print~nl:trueletxml_to_string?pretty_print?declxml=letbuf=Buffer.create256inxml_to_bufferbufxml?pretty_print?decl;Buffer.contentsbuf(* for internal use only: read one parsed XML value from its string
* representation *)letxml_of_strings:xmllist=letxml_parser=init_from_stringsinletres=tryread_xml_objxml_parserwithC.Error((_,lnum',cnum'),error)->(* string location can be missing when we parse from XML embedded in
* Protobuf *)let(fname,lnum,cnum)=tryPiqloc.findswithNot_found->("embedded",1,-1)in(* adjust location column number: add the original column number of the
* '#' character + 1 for the space that follows it; note that this method
* doesn't give 100% guarantee that the offset is correct, but it is
* accurate if all the text literal lines start at the same column *)letloc=(fname,lnum+lnum'-1,cnum+cnum'+1)inC.error_atloc("error parsing embedded XML: "^error)in(* TODO: check for trailing XML data -- there shouldn't be any after this
* object we've just read *)matchreswith|Some(`Elem(_name,xml_list))->(* as in other places, e.g. Piqobj_of_xml, we ignore the top-level
* elemnt's name *)xml_list|Somexml->(* this should never happen, because the xml parser always returns
* top-level element *)C.errorxml"XML root element expected"|None->C.errors"string doesn't have XML data"let_=(* pretty print and skip <?xml ...> declaration *)Piqobj.string_of_xml:=(funx->xml_to_stringx~pretty_print:true~decl:false);(* parse xml from string while not expecting <?xml ...> declaration *)Piqobj.xml_of_string:=(funx->xml_of_stringx)