From 5002cc34dfb57841dfa2cdacfde470888e793180 Mon Sep 17 00:00:00 2001 From: Alex Schroeder Date: Wed, 31 Aug 2005 13:12:13 +0000 Subject: [PATCH] Added lots of indexes for the Elisp Manual and lots of hacks to make the HTML mess parsable. --- info-ref | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/info-ref b/info-ref index 9132ed99..d0041def 100644 --- a/info-ref +++ b/info-ref @@ -10,7 +10,24 @@ use URI; my @indexes = qw( http://www.gnu.org/software/emacs/manual/html_node/Command-Index.html http://www.gnu.org/software/emacs/manual/html_node/Variable-Index.html - http://www.gnu.org/software/emacs/manual/html_node/Concep-Index.html + http://www.gnu.org/software/emacs/manual/html_node/Concept-Index.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_728.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_729.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_730.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_731.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_732.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_733.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_734.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_735.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_736.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_737.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_738.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_739.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_740.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_741.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_742.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_743.html + http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_744.html ); my $db = '/org/org.emacswiki/htdocs/emacs/info-ref.dat'; @@ -64,16 +81,29 @@ sub Find { sub Initialize { my %map = (); + print $q->header, $q->start_html; foreach my $url (@indexes) { + print $q->p($url);; # determine base URI my $base = URI->new($url); # fetch and parse data my $data = GetRaw($url); + # some markup fixes for the elisp manual + $data =~ s/&([<"])/&$1/g; + $data =~ s/<([<"])/<$1/g; + $data =~ s/="fn_"">/="fn_"">/; + $data =~ s///; + $data =~ s'''gi; + $data =~ s'

'
'; my $parser = XML::LibXML->new(); - my $doc = $parser->parse_html_string($data); + my $doc; + eval { $doc = $parser->parse_html_string($data); }; + print $q->p($@) if $@; + next if $@; + # emacs manual my @nodelist = $doc->findnodes('/html/body/ul/li'); foreach my $node (@nodelist) { - my $text = $node->textContent(); + my $text = $node->textContent; my ($key) = split(/: /, $text); my $a = $node->findnodes('descendant::a')->[0]; my $label = $a->textContent; @@ -83,6 +113,18 @@ sub Initialize { $map{$key} = () unless $map{$key}; $map{$key}{$l->canonical} = $label; } + # elisp manual + @nodelist = $doc->findnodes('/html/body/table/tbody/tr/td'); + foreach my $node (@nodelist) { + my ($item, $section) = $node->findnodes('descendant::a'); + my $key = $item->textContent; + my $label = $section->textContent; + my $link = $item->getAttribute('href'); + my $l = URI->new_abs($link, $base); + print "$key -> $label $l\n"; + $map{$key} = () unless $map{$key}; + $map{$key}{$l->canonical} = $label; + } } my $data = join($nl, map { my $key = $_; @@ -92,7 +134,7 @@ sub Initialize { } keys %{$map{$_}}) } keys %map); WriteStringToFile($db, $data); - ReportError('Database initialized', '200 OK'); + print $q->p('Database initialized'), $q->end_html; } sub GetRaw {