From 5002cc34dfb57841dfa2cdacfde470888e793180 Mon Sep 17 00:00:00 2001
From: Alex Schroeder
Date: Wed, 31 Aug 2005 13:12:13 +0000
Subject: [PATCH] Added lots of indexes for the Elisp Manual and lots of hacks
 to make the HTML mess parsable.
---
info-ref | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 46 insertions(+), 4 deletions(-)
diff --git a/info-ref b/info-ref
index 9132ed99..d0041def 100644
--- a/info-ref
+++ b/info-ref
@@ -10,7 +10,24 @@ use URI;
my @indexes = qw(
http://www.gnu.org/software/emacs/manual/html_node/Command-Index.html
http://www.gnu.org/software/emacs/manual/html_node/Variable-Index.html
- http://www.gnu.org/software/emacs/manual/html_node/Concep-Index.html
+ http://www.gnu.org/software/emacs/manual/html_node/Concept-Index.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_728.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_729.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_730.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_731.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_732.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_733.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_734.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_735.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_736.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_737.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_738.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_739.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_740.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_741.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_742.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_743.html
+ http://www.gnu.org/software/emacs/elisp-manual/html_node/elisp_744.html
);
my $db = '/org/org.emacswiki/htdocs/emacs/info-ref.dat';
@@ -64,16 +81,29 @@ sub Find {
sub Initialize {
my %map = ();
+ print $q->header, $q->start_html;
foreach my $url (@indexes) {
+ print $q->p($url);;
# determine base URI
my $base = URI->new($url);
# fetch and parse data
my $data = GetRaw($url);
+ # some markup fixes for the elisp manual
+ $data =~ s/&([<"])/&$1/g;
+ $data =~ s/<([<"])/<$1/g;
+ $data =~ s/="fn_"">/="fn_"">/;
+ $data =~ s///;
+ $data =~ s'?font.*?>''gi;
+ $data =~ s'
'
';
my $parser = XML::LibXML->new();
- my $doc = $parser->parse_html_string($data);
+ my $doc;
+ eval { $doc = $parser->parse_html_string($data); };
+ print $q->p($@) if $@;
+ next if $@;
+ # emacs manual
my @nodelist = $doc->findnodes('/html/body/ul/li');
foreach my $node (@nodelist) {
- my $text = $node->textContent();
+ my $text = $node->textContent;
my ($key) = split(/: /, $text);
my $a = $node->findnodes('descendant::a')->[0];
my $label = $a->textContent;
@@ -83,6 +113,18 @@ sub Initialize {
$map{$key} = () unless $map{$key};
$map{$key}{$l->canonical} = $label;
}
+ # elisp manual
+ @nodelist = $doc->findnodes('/html/body/table/tbody/tr/td');
+ foreach my $node (@nodelist) {
+ my ($item, $section) = $node->findnodes('descendant::a');
+ my $key = $item->textContent;
+ my $label = $section->textContent;
+ my $link = $item->getAttribute('href');
+ my $l = URI->new_abs($link, $base);
+ print "$key -> $label $l\n";
+ $map{$key} = () unless $map{$key};
+ $map{$key}{$l->canonical} = $label;
+ }
}
my $data = join($nl, map {
my $key = $_;
@@ -92,7 +134,7 @@ sub Initialize {
} keys %{$map{$_}})
} keys %map);
WriteStringToFile($db, $data);
- ReportError('Database initialized', '200 OK');
+ print $q->p('Database initialized'), $q->end_html;
}
sub GetRaw {