1 #!/usr/bin/perl -- # -*- Perl -*-
\r
3 # this needs some cleanup...
\r
5 my $PSTOTEXT = "pstotext";
\r
7 my $pdf = shift @ARGV;
\r
# Run pstotext on the PDF and read its output through the F handle.
# The list form of open starts the program without a shell, so a file name
# containing spaces or shell metacharacters is passed through unchanged.
# $PSTOTEXT is split on whitespace so it may still carry command-line options,
# and a missing file argument is omitted rather than passed as an empty string.
my @pstotext_cmd = (split(' ', $PSTOTEXT), defined $pdf ? ($pdf) : ());
open (F, '-|', @pstotext_cmd)
    or die "Cannot run '@pstotext_cmd': $!\n";
\r
17 $inindex = 1 if /^<index/;
\r
20 $index .= $_ if /^\s*</;
\r
25 while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
\r
30 my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
\r
35 foreach my $page (@pages) {
\r
36 my $pageno = &pageno($page);
\r
37 if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
\r
38 for (my $count = $1; $count <= $2; $count++) {
\r
39 push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
\r
42 push (@mpages, $page);
\r
45 @pages = sort rangesort @mpages;
\r
48 # Remove duplicates...
\r
52 foreach my $page (@pages) {
\r
53 my $pageno = &pageno($page);
\r
54 if ($pageno ne $current) {
\r
55 push (@mpages, $page);
\r
62 # Collapse ranges...
\r
67 my $len = &rangelen($count, @pages);
\r
69 my $page = shift @pages;
\r
70 push (@cpages, $page);
\r
72 my $fpage = shift @pages;
\r
75 $lpage = shift @pages;
\r
78 my $fpno = &pageno($fpage);
\r
79 my $lpno = &pageno($lpage);
\r
# Rewrite the page label inside $fpage from "$fpno" to "$fpno-$lpno".
# \Q...\E makes the label match literally, so characters such as "." or "("
# in a page label are not interpreted as regex syntax.
$fpage =~ s/>\Q$fpno\E</>${fpno}-$lpno</s;
\r
81 push (@cpages, $fpage);
\r
87 my $page = shift @pages;
\r
91 $page = shift @pages;
\r
93 $cindex .= ", $page";
\r
103 $page =~ s/^<phrase.*?>//;
\r
104 $page =~ s/^<link.*?>//;
\r
106 return $1 if $page =~ /^([^<>]+)/;
\r
111 my $apno = &pageno($a);
\r
112 my $bpno = &pageno($b);
\r
# Page numbers that do not start with a digit (e.g. roman numerals) sort
# before arabic ones; otherwise the two page numbers are compared numerically.
\r
115 return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
\r
116 return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
\r
117 return $apno <=> $bpno;
\r
126 my $current = &pageno($pages[$count]);
\r
127 while ($count < $#pages && $inrange) {
\r
129 my $next = &pageno($pages[$count]);
\r
130 if ($current + 1 eq $next) {
\r