2 # Copyright (C) 2014 Laurentian University
3 # Author: Dan Scott <dscott@laurentian.ca>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 use strict; use warnings;
21 use DBI qw(:sql_types);
22 use DBD::Pg qw(:pg_types);
# File-level scalars for database connection details and the help flag.
# NOTE(review): none of these is referenced in the visible portion of the
# file (the option table stores its values in %settings); confirm whether any
# besides $help is still needed.
24 my ($dbhost, $dbport, $dbname, $dbuser, $dbpw, $help);
28 =item create_sitemaps() - Write the sitemap files
30 With a maximum of 50,000 URLs per sitemap, this method
31 automatically increments the sitemap file numbers and
32 generates a corresponding sitemap index that lists all
33 of the individual sitemap files.
35 See http://www.sitemaps.org/ for the specification
# Arguments: the settings hash ref, an array ref of record rows, and the
# org unit id.
39 my ($settings, $bibs, $aou_id) = @_;
# File name of the current sitemap: the configured prefix followed by
# "sitemap<N>.xml". The declaration of $f_cnt is not visible in this excerpt.
44 my $fn = $settings->{'prefix'} . "sitemap$f_cnt.xml";
# NOTE(review): FH is a bareword (package-global) filehandle and no close()
# on it is visible in this excerpt; a lexical handle plus a checked close
# would be the safer idiom.
46 open(FH, '>', $fn) or die "Could not write sitemap $f_cnt\n";
# XML declaration and opening <urlset> element for the sitemap file.
47 print FH '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
48 print FH '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
# One <url> entry per row: element 0 of the row is appended to the record
# URL path and element 1 is written as <lastmod>.
# NOTE(review): the values are written without XML escaping.
50 foreach my $bib (@$bibs) {
51 print FH "<url><loc>" . $settings->{'lib-hostname'} . "/eg/opac/record/" . $bib->[0];
# Adds the org unit id as the locg query parameter; presumably only when
# $aou_id is set (the guarding condition is not visible here).
53 print FH "?locg=$aou_id";
55 print FH "</loc><lastmod>" . $bib->[1] . "</lastmod></url>\n";
# After every 50,000 rows, finish the current file and start the next one.
# The code that advances $r_cnt and $f_cnt is not visible in this excerpt.
57 if ($r_cnt % 50000 == 0) {
59 print FH "</urlset>\n";
# NOTE(review): this "my $fn" declares a new variable in the inner scope
# rather than updating the $fn declared above.
61 my $fn = $settings->{'prefix'} . "sitemap$f_cnt.xml";
# Re-opening FH implicitly closes the previous file.
# NOTE(review): this die message differs from the one used for the first
# sitemap file and does not name the file that failed.
63 open(FH, '>', $fn) or die "Could not write bibs\n";
64 print FH '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
65 print FH '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
# Close the <urlset> element of the last sitemap file.
68 print FH "</urlset>\n";
# Write the sitemap index: one <sitemap> entry per file name in @sitemaps,
# each addressed as <lib-hostname>/<file name>. The code that fills @sitemaps
# is not visible in this excerpt.
71 open(INDEXFH, '>', $settings->{'prefix'} . "sitemapindex.xml") or die "Could not write sitemap index\n";
72 print INDEXFH '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
73 print INDEXFH '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
74 foreach my $fn (@sitemaps) {
75 print INDEXFH "<sitemap><loc>" . $settings->{'lib-hostname'} . "/$fn</loc></sitemap>\n";
77 print INDEXFH "</sitemapindex>\n";
83 =item get_settings() - Extracts database settings from opensrf.xml
# XPath expressions that select the database connection values (host, port,
# database name, user, password) configured for the
# open-ils.reporter-store application in opensrf.xml.
88 my $host = "/opensrf/default/apps/open-ils.reporter-store/app_settings/database/host/text()";
89 my $port = "/opensrf/default/apps/open-ils.reporter-store/app_settings/database/port/text()";
90 my $dbname = "/opensrf/default/apps/open-ils.reporter-store/app_settings/database/db/text()";
91 my $user = "/opensrf/default/apps/open-ils.reporter-store/app_settings/database/user/text()";
92 my $pw = "/opensrf/default/apps/open-ils.reporter-store/app_settings/database/pw/text()";
# Parse the configuration file; $config_file is assigned outside this block.
94 my $parser = XML::LibXML->new();
95 my $opensrf_config = $parser->parse_file($config_file);
97 # If the user passed in settings at the command line,
98 # we don't want to override them
# NOTE(review): "||" keeps an existing value only when it is true, so a
# defined-but-false value (0 or '') is also replaced by the config file value.
# NOTE(review): findnodes() is called in scalar context here, so each stored
# value is whatever findnodes() returns in that context rather than a plain
# string; confirm it stringifies as intended where it is used (findvalue()
# returns the string directly).
99 $settings->{host} = $settings->{host} || $opensrf_config->findnodes($host);
100 $settings->{port} = $settings->{port} || $opensrf_config->findnodes($port);
101 $settings->{db} = $settings->{db} || $opensrf_config->findnodes($dbname);
102 $settings->{user} = $settings->{user} || $opensrf_config->findnodes($user);
103 $settings->{pw} = $settings->{pw} || $opensrf_config->findnodes($pw);
106 =item get_record_ids() - Gets a list of record IDs
# The single argument is the settings hash ref.
109 my $settings = shift;
# Connect to PostgreSQL using the db, host, and port settings. The user and
# password are concatenated with "" before being passed, presumably to force
# them to plain strings.
112 my $dbh = DBI->connect('dbi:Pg:dbname=' . $settings->{db} .
113 ';host=' . $settings->{host} . ';port=' . $settings->{port} . ';',
114 $settings->{user} . "", $settings->{pw} . "", {AutoCommit => 1}
# Report a connection failure on STDERR.
# NOTE(review): the guarding condition is not visible in this excerpt. If
# these lines run when connect() returned undef, calling ->errstr on $dbh
# will itself die; $DBI::errstr is the usual way to read the error in that
# case. Confirm against the guard.
117 print STDERR "Could not connect to database. ";
118 print STDERR "Error was " . $dbh->errstr . "\n";
# When a library shortname was supplied, look up the matching
# actor.org_unit id and bind the result column to $aou_id. The fetch call
# that populates $aou_id is not visible in this excerpt.
122 if ($settings->{'lib-shortname'}) {
123 my $stmt = $dbh->prepare("SELECT id FROM actor.org_unit WHERE shortname = ?");
124 $stmt->execute(($settings->{'lib-shortname'}));
125 my $rv = $stmt->bind_columns(\$aou_id);
131 SELECT ?::date AS val
139 WHERE id IN (SELECT id FROM actor.org_unit_descendants(?))
144 WHERE id IN (SELECT id FROM actor.org_unit_ancestors(?))
145 AND id NOT IN (SELECT id FROM org_top())
150 SELECT DISTINCT id, edit_date FROM (
153 WHEN bre.edit_date::date < (SELECT val FROM date_floor LIMIT 1) THEN (SELECT val FROM date_floor LIMIT 1)
154 ELSE bre.edit_date::date
156 FROM biblio.record_entry bre
157 INNER JOIN asset.opac_visible_copies aovc ON bre.id = aovc.record
160 $q .= " WHERE circ_lib IN (SELECT id FROM copy_orgs)";
166 WHEN bre.edit_date::date < (SELECT val FROM date_floor LIMIT 1) THEN (SELECT val FROM date_floor LIMIT 1)
167 ELSE bre.edit_date::date
169 FROM biblio.record_entry bre
170 INNER JOIN asset.call_number acn ON bre.id = acn.record
171 WHERE bre.deleted IS FALSE AND acn.deleted IS FALSE
175 AND owning_lib IN (SELECT id FROM uri_orgs) AND label = '##URI##'
180 ORDER BY edit_date DESC, id DESC
# Prepare the assembled query text in $q.
182 my $stmt = $dbh->prepare($q);
# Two binding variants follow; the condition choosing between them is not
# visible in this excerpt. This one binds the date floor as placeholder 1 and
# the org unit id as placeholders 2 and 3 (presumably the form used when an
# org unit was resolved).
184 $stmt->bind_param(1, $settings->{'date'}, { pg_type => PG_DATE });
185 $stmt->bind_param(2, $aou_id, SQL_INTEGER);
186 $stmt->bind_param(3, $aou_id, SQL_INTEGER);
# This variant binds only the date floor.
188 $stmt->bind_param(1, $settings->{'date'}, { pg_type => PG_DATE });
# Fetch every row, keeping columns 0 and 1 of each row as an array ref.
192 my $bibs = $stmt->fetchall_arrayref([0, 1]);
# Error report on STDERR; the condition guarding it is not visible here.
195 print STDERR "Error was " . $dbh->errstr . "\n";
# Hand back the fetched rows together with the org unit id.
198 return ($bibs, $aou_id);
# Option specification => destination pairs (presumably the argument list of
# a Getopt::Long GetOptions call; the call itself is not visible in this
# excerpt). Note that --date-floor is stored under the 'date' key, --password
# under 'pw', --database under 'db', and --hostname under 'host'.
209 "lib-hostname=s" => \$settings{'lib-hostname'},
210 "lib-shortname=s" => \$settings{'lib-shortname'},
211 "prefix=s" => \$settings{'prefix'},
212 "date-floor=s" => \$settings{'date'},
213 "config-file=s" => \$config_file,
214 "user=s" => \$settings{'user'},
215 "password=s" => \$settings{'pw'},
216 "database=s" => \$settings{'db'},
217 "hostname=s" => \$settings{'host'},
218 "port=i" => \$settings{'port'},
# Ask eg_config for the system configuration directory and build the path
# to opensrf.xml from the first line of its output; presumably this only
# happens when no --config-file was given (the guard is not visible here).
# NOTE(review): backtick output keeps its trailing newline; confirm it is
# removed before the path is built (no chomp is visible in this excerpt).
223 my @temp = `eg_config --sysconfdir`;
225 $sysconfdir = $temp[0];
226 $config_file = File::Spec->catfile($sysconfdir, "opensrf.xml");
# Stop if the configuration file does not exist.
229 unless (-e $config_file) { die "Error: $config_file does not exist. \n"; }
# Sitemaps are generated only when --lib-hostname was supplied.
231 if ($settings{'lib-hostname'}) {
232 # Get additional settings from the config file
233 get_settings(\%settings);
# Fetch the record rows (and resolved org unit id), then write the files.
235 my ($bibs, $aou_id) = get_record_ids(\%settings);
236 create_sitemaps(\%settings, $bibs, $aou_id);
245 sitemap_generator [OPTION] ... [COMMAND] ... [CONFIG OPTIONS]
248 Creates a set of sitemaps for enabling web crawlers to crawl
249 freshly changed bibliographic records.
253 specifies the opensrf.xml file
256 REQUIRED: hostname for the catalog (e.g., "https://example.com")
259 filename to add as a prefix to the generated set of sitemap files
262 a date in YYYY-MM-DD format that specifies the minimum date that
263 should be reflected for when a record was last updated; useful if
264 you enrich or change the HTML without changing records. Defaults
268 include all records for the specified library and its children;
269 defaults to all records
272 This script will normally be run as a cron job by the opensrf user from
273 the web root directory.
275 sitemap_generator --lib-hostname https://example.com --lib-shortname BR1 \
278 This generates a set of sitemap files like so:
279 * example_sitemapindex.xml
280 * example_sitemap1.xml
281 * example_sitemap2.xml