123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293(*
* Copyright (c) 2014, OCaml.org project
* Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)typet={title:string;link:Uri.toption;date:Syndic.Date.toption;feed:Feed.t;author:string;email:string;content:Nethtml.documentlist;mutablelink_response:(string,string)resultoption;}letreclen_prefix_of_htmlhtmllen=iflen<=0then(0,[])elsematchhtmlwith|[]->(len,[])|el::tl->letlen,prefix_el=len_prefix_of_elelleninletlen,prefix_tl=len_prefix_of_htmltllenin(len,prefix_el::prefix_tl)andlen_prefix_of_elellen=matchelwith|Nethtml.Datad->letlen'=len-String.lengthdin(len',iflen'>=0thenelelseData(String.subd0len^"…"))|Nethtml.Element(tag,args,content)->(* Remove "id" and "name" to avoid duplicate anchors with the whole
post. *)letargs=List.filter(fun(n,_)->n<>"id"&&n<>"name")argsinletlen,prefix_content=len_prefix_of_htmlcontentlenin(len,Element(tag,args,prefix_content))letprefix_of_htmlhtmllen=snd(len_prefix_of_htmlhtmllen)letrecfilter_maplf=matchlwith|[]->[]|a::tl->(matchfawithNone->filter_maptlf|Somea->a::filter_maptlf)letencode_html=Netencoding.Html.encode~prefer_name:false~in_enc:`Enc_utf8()letdecode_documenthtml=Nethtml.decode~enc:`Enc_utf8htmlletencode_documenthtml=Nethtml.encode~enc:`Enc_utf8htmlletrecresolve?xmlbasehtml=List.map(resolve_links_el~xmlbase)htmlandresolve_links_el~xmlbase=function|Nethtml.Element("a",attrs,sub)->letattrs=matchList.partition(fun(t,_)->t="href")attrswith|[],_->attrs|(_,h)::_,attrs->letsrc=Uri.to_string(Syndic.XML.resolve~xmlbase(Uri.of_stringh))in("href",src)::attrsinNethtml.Element("a",attrs,resolve?xmlbasesub)|Nethtml.Element("img",attrs,sub)->letattrs=matchList.partition(fun(t,_)->t="src")attrswith|[],_->attrs|(_,src)::_,attrs->letsrc=Uri.to_string(Syndic.XML.resolve~xmlbase(Uri.of_stringsrc))in("src",src)::attrsinNethtml.Element("img",attrs,sub)|Nethtml.Element(e,attrs,sub)->Nethtml.Element(e,attrs,resolve?xmlbasesub)|Data_asd->d(* Things that posts should not contain *)letundesired_tags=["style";"script"]letundesired_attr=["id"]letremove_undesired_attr=List.filter(fun(a,_)->not(List.memaundesired_attr))letrecremove_undesired_tagshtml=filter_maphtmlremove_undesired_tags_elandremove_undesired_tags_el=function|Nethtml.Element(t,a,sub)->ifList.memtundesired_tagsthenNoneelseSome(Nethtml.Element(t,remove_undesired_attra,remove_undesired_tagssub))|Data_asd->Somedletrelaxed_html40_dtd=(* Allow <font> inside <pre> because blogspot uses it! :-( *)letconstr=`Sub_exclusions(["img";"object";"applet";"big";"small";"sub";"sup";"basefont"],`Inline)inletdtd=Nethtml.relaxed_html40_dtdin("pre",(`Block,constr))::List.remove_assoc"pre"dtdlethtml_of_text?xmlbases=tryNethtml.parse(newNetchannels.input_strings)~dtd:relaxed_html40_dtd|>decode_document|>resolve?xmlbase|>remove_undesired_tagswith_->[Nethtml.Data(encode_htmls)](* Do not trust sites using XML for HTML content. Convert to string and parse
back. (Does not always fix bad HTML unfortunately.) *)lethtml_of_syndic=letns_prefix_=Some""infun?xmlbaseh->html_of_text?xmlbase(String.concat""(List.map(Syndic.XML.to_string~ns_prefix)h))letstring_of_option=functionNone->""|Somes->s(* Email on the forge contain the name in parenthesis *)letforge_name_re=Str.regexp".*(\\([^()]*\\))"letpost_comparep1p2=(* Most recent posts first. Posts with no date are always last *)match(p1.date,p2.date)with|Somed1,Somed2->Syndic.Date.compared2d1|None,Some_->1|Some_,None->-1|None,None->1letrecremovenl=ifn<=0thenlelsematchlwith[]->[]|_::tl->remove(n-1)tlletrectaken=function|[]->[]|e::tl->ifn>0thene::take(n-1)tlelse[](* Blog feed
***********************************************************************)letpost_of_atom~(feed:Feed.t)(e:Syndic.Atom.entry)=letlink=trySome(List.find(funl->l.Syndic.Atom.rel=Syndic.Atom.Alternate)e.links).hrefwithNot_found->(matche.linkswithl::_->Somel.href|[]->None)inletdate=matche.publishedwithSome_->e.published|None->Somee.updatedinletcontent=matche.contentwith|Some(Texts)->html_of_texts|Some(Html(xmlbase,s))->html_of_text?xmlbases|Some(Xhtml(xmlbase,h))->html_of_syndic?xmlbaseh|Some(Mime_)|Some(Src_)|None->(matche.summarywith|Some(Texts)->html_of_texts|Some(Html(xmlbase,s))->html_of_text?xmlbases|Some(Xhtml(xmlbase,h))->html_of_syndic?xmlbaseh|None->[])inletauthor,_=e.authorsin{title=Util.string_of_text_constructe.title;link;date;feed;author=author.name;email="";content;link_response=None;}letpost_of_rss2~(feed:Feed.t)it=lettitle,content=matchit.Syndic.Rss2.storywith|All(t,xmlbase,d)->((t,matchit.contentwith|_,""->html_of_text?xmlbased|xmlbase,c->html_of_text?xmlbasec))|Titlet->letxmlbase,c=it.contentin(t,html_of_text?xmlbasec)|Description(xmlbase,d)->(("",matchit.contentwith|_,""->html_of_text?xmlbased|xmlbase,c->html_of_text?xmlbasec))inletlink=match(it.guid,it.link)with|Someu,_whenu.permalink->Someu.data|_,Some_->it.link|Someu,_->(* Sometimes the guid is indicated with isPermaLink="false" but is
nonetheless the only URL we get (e.g. ocamlpro). *)Someu.data|None,None->Nonein{title;link;feed;author=feed.name;email=string_of_optionit.author;content;date=it.pubDate;link_response=None;}letposts_of_feedc=matchc.Feed.contentwith|Feed.Atomf->List.map(post_of_atom~feed:c)f.Syndic.Atom.entries|Feed.Rss2ch->List.map(post_of_rss2~feed:c)ch.Syndic.Rss2.itemsletstring_of_htmlhtml=letbuffer=Buffer.create1024inletchannel=newNetchannels.output_bufferbufferinlet()=Nethtml.writechannel@@encode_documenthtmlinBuffer.contentsbufferletmk_entrypost=letcontent=Syndic.Atom.Html(None,string_of_htmlpost.content)inletcontributors=[Syndic.Atom.author~uri:(Uri.of_stringpost.feed.url)post.feed.name]inletlinks=matchpost.linkwith|Somel->[Syndic.Atom.link~rel:Syndic.Atom.Alternatel]|None->[]in(* TODO: include source *)letid=matchpost.linkwith|Somel->l|None->Uri.of_string(Digest.to_hex(Digest.stringpost.title))inletauthors=(Syndic.Atom.author~email:post.emailpost.author,[])inlettitle:Syndic.Atom.text_construct=Syndic.Atom.Textpost.titleinletupdated=matchpost.datewith(* Atom entry requires a date but RSS2 does not. So if a date
* is not available, just capture the current date. *)|None->Ptime.of_float_s(Unix.gettimeofday())|>Option.get|Somed->dinSyndic.Atom.entry~content~contributors~links~id~authors~title~updated()letmk_entriesposts=List.mapmk_entrypostsletget_posts?n?(ofs=0)planet_feeds=letposts=List.concat@@List.mapposts_of_feedplanet_feedsinletposts=List.sortpost_comparepostsinletposts=removeofspostsinmatchnwithNone->posts|Somen->takenposts(* Fetch the link response and cache it. *)letfetch_linkt=match(t.link,t.link_response)with|None,_->None|Some_,Some(Okx)->Somex|Some_,Some(Error_)->None|Somelink,None->(tryletresponse=Http.get(Uri.to_stringlink)int.link_response<-Some(Okresponse);Someresponsewith_exn->t.link_response<-Some(Error"");None)