diff options
| author | William Harrington <kb0iic@berzerkula.org> | 2014-11-27 16:29:31 -0600 | 
|---|---|---|
| committer | William Harrington <kb0iic@berzerkula.org> | 2014-11-27 16:29:31 -0600 | 
| commit | 7df987a652b93d55dd8eca363706d3bacc469b55 (patch) | |
| tree | 17f84ce0fee76a3a0476b714eae09486ab2470e6 /stylesheets/lfs-xsl/docbook-xsl-snapshot/fo/pdf2index | |
Diffstat (limited to 'stylesheets/lfs-xsl/docbook-xsl-snapshot/fo/pdf2index')
| -rw-r--r-- | stylesheets/lfs-xsl/docbook-xsl-snapshot/fo/pdf2index | 140 | 
1 files changed, 140 insertions, 0 deletions
#!/usr/bin/perl -- # -*- Perl -*-

# pdf2index (stylesheets/lfs-xsl/docbook-xsl-snapshot/fo/pdf2index)
#
# Runs pstotext on a PDF, keeps the <index>...</index> markup found in its
# output, and tidies every run of <phrase role="pageno"> elements:
# page ranges are expanded, the pages are sorted, duplicates are dropped,
# runs of three or more consecutive pages are collapsed back into a range,
# and the survivors are joined with ", ".  The result goes to STDOUT.
#
# Usage: pdf2index file.pdf

use strict;
use warnings;

my $PSTOTEXT = "pstotext";

# Run as a program only when executed directly, so the helpers below can
# also be loaded (require/do) and exercised without invoking pstotext.
exit(main(@ARGV)) unless caller;

# main(@args): program entry point.  $args[0] is the PDF to process.
# Returns the process exit status (0 on success, 2 on a usage error).
sub main {
    my ($pdf) = @_;

    if (!defined $pdf || $pdf eq '') {
        print STDERR "Usage: $0 file.pdf\n";
        return 2;
    }

    my $cindex = collapse_index(read_index($pdf));
    print "$cindex\n";
    return 0;
}

# read_index($pdf): run pstotext on $pdf and return the index markup as a
# single string.  Dies if pstotext cannot be started; warns if it exits
# with a failure status.
sub read_index {
    my ($pdf) = @_;

    # List-form pipe open: no shell is involved, so a file name containing
    # spaces or shell metacharacters reaches pstotext untouched.
    open(my $fh, '-|', $PSTOTEXT, $pdf)
        or die "Cannot run $PSTOTEXT on $pdf: $!\n";

    my $index   = "";
    my $inindex = 0;
    while (my $line = <$fh>) {
        # The closing tag is always kept, and ends the index section.
        if ($line =~ /^<\/index/) {
            $index .= $line;
            $inindex = 0;
        }
        $inindex = 1 if $line =~ /^<index/;

        # Inside the index, keep only the lines that start with markup.
        $index .= $line if $inindex && $line =~ /^\s*</;
    }

    close($fh)
        or warn "$PSTOTEXT did not finish cleanly on $pdf (status $?)\n";

    return $index;
}

# collapse_index($index): return $index with every run of adjacent
# <phrase role="pageno"> elements normalised (see the file header).
# Text outside those runs is copied through unchanged.
sub collapse_index {
    my ($index) = @_;
    my $cindex = "";

    while ($index =~ /^(.*?)((?:<phrase role="pageno">.*?<\/phrase>\s*)+)/s) {
        my ($before, $run) = ($1, $2);
        # Continue after the matched run; $+[0] is the end offset of the
        # match, which avoids the program-wide cost of using $'.
        $index = substr($index, $+[0]);
        $cindex .= $before;

        my @pages = $run =~ m/<phrase role="pageno">.*?<\/phrase>\s*/sg;

        @pages = sort rangesort expand_ranges(@pages);
        @pages = drop_adjacent_duplicates(@pages);
        @pages = collapse_runs(@pages);

        # Trailing whitespace belonged to the original layout of the run.
        s/\s+\z// for @pages;
        $cindex .= join(", ", @pages);
    }

    return $cindex . $index;
}

# expand_ranges(@pages): replace each phrase whose page number is a range
# ("12-15", any single non-digit separator) by one phrase per page.
sub expand_ranges {
    my @expanded;

    foreach my $page (@_) {
        my $pageno = pageno($page);
        if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) {    # funky -
            my ($first, $last) = ($1, $2);
            for (my $count = $first; $count <= $last; $count++) {
                # Expanded entries carry role="pageno" like every other
                # page phrase, not the text of the range itself.
                push @expanded, "<phrase role=\"pageno\">$count</phrase>";
            }
        }
        else {
            push @expanded, $page;
        }
    }

    return @expanded;
}

# drop_adjacent_duplicates(@pages): drop every phrase whose page number
# equals that of the phrase kept just before it (input must be sorted).
sub drop_adjacent_duplicates {
    my @unique;
    my $current = "";

    foreach my $page (@_) {
        my $pageno = pageno($page);
        next if $pageno eq $current;
        push @unique, $page;
        $current = $pageno;
    }

    return @unique;
}

# collapse_runs(@pages): turn each run of three or more consecutive pages
# into a single "first-last" phrase; shorter runs are left alone.
sub collapse_runs {
    my @pages = @_;
    my @collapsed;

    while (@pages) {
        my $len = rangelen(0, @pages);
        if ($len <= 2) {
            push @collapsed, shift @pages;
            next;
        }

        my @run   = splice(@pages, 0, $len);
        my $fpage = $run[0];
        my $fpno  = pageno($fpage);
        my $lpno  = pageno($run[-1]);
        # Rewrite the first phrase in place so any wrapper markup (such
        # as a <link>) around the number is kept.
        $fpage =~ s/>\Q$fpno\E</>${fpno}-$lpno</s;
        push @collapsed, $fpage;
    }

    return @collapsed;
}

# pageno($page): return the page number text of a pageno phrase, looking
# through an optional leading <link ...> wrapper.  Returns "?" when no
# text follows the opening tag(s).
sub pageno {
    my $page = shift;

    $page =~ s/^<phrase.*?>//;
    $page =~ s/^<link.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

# rangesort: sort comparator (uses $a and $b).  Page numbers that do not
# start with a digit (e.g. roman numerals) come before arabic ones and
# compare equal to one another; arabic ones are ordered numerically.
sub rangesort {
    my $apno = pageno($a);
    my $bpno = pageno($b);

    my ($anum) = $apno =~ /^(\d+)/;
    my ($bnum) = $bpno =~ /^(\d+)/;

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if !defined $anum &&  defined $bnum;
    return  1 if  defined $anum && !defined $bnum;
    return  0 if !defined $anum;    # both non-arabic: no numeric order

    return $anum <=> $bnum;
}

# rangelen($count, @pages): return the length of the run of consecutive
# page numbers in @pages that starts at index $count.  A page that is not
# purely numeric never starts or continues a run, so the result is 1 for
# it.  Returns 0 when $count is past the end of @pages.
sub rangelen {
    my ($count, @pages) = @_;

    return 0 if $count > $#pages;

    my $len     = 1;
    my $current = pageno($pages[$count]);

    while ($count < $#pages) {
        $count++;
        my $next = pageno($pages[$count]);

        # Only arabic numbers can form a run; without this guard a roman
        # page numifies to 0 and would "continue" into page 1.
        last unless $current =~ /^[0-9]+\z/ && ($current + 1) eq $next;

        $current = $next;
        $len++;
    }

    return $len;
}
