2 # Copyright (C) 2010-2011 Laurentian University
3 # Author: Dan Scott <dscott@laurentian.ca>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 # ---------------------------------------------------------------
21 use MARC::File::XML (BinaryEncoding => 'UTF-8');
24 use OpenILS::Utils::Fieldmapper;
25 use OpenSRF::Utils::SettingsClient;
26 use OpenSRF::EX qw/:try/;
28 use Unicode::Normalize;
29 use OpenILS::Application::AppUtils;
31 use Pod::Usage qw/ pod2usage /;
# Class-level MARC::Charset setting, applied once before any record is parsed.
33 MARC::Charset->assume_unicode(1);
# Command-line state: the bounds of an ID range and the --refresh flag.
35 my ($start_id, $end_id, $refresh);
# Default OpenSRF bootstrap file; overridden by --configuration.
# '@sysconfdir@' looks like a build-time placeholder -- TODO confirm.
37 my $bootstrap = '@sysconfdir@/opensrf_core.xml';
# Parse the command line. --record may be repeated (it fills @records); the
# *_id and days_back options only accept integers ('=i').
# NOTE(review): 'record' and 'refresh' start with the same letter, so with
# Getopt::Long's default abbreviation matching a bare -r looks ambiguous --
# confirm before relying on the short form.
41 my $result = GetOptions(
43 'configuration=s' => \$bootstrap,
44 'record=i' => \@records,
45 'refresh' => \$refresh,
47 'start_id=i' => \$start_id,
48 'end_id=i' => \$end_id,
49 'days_back=i' => \$days_back,
# Taken when option parsing failed or --help was requested.
52 if (!$result or $options{help}) {
# --start_id and --days_back are mutually exclusive.
# NOTE(review): this is a truthiness test, so a value of 0 for either option
# slips past the check.
56 if ($start_id && $days_back) {
57 print "Can't use both start ID and days back!\n";
# Connect to OpenSRF as a client using the selected bootstrap file.
61 OpenSRF::System->bootstrap_client(config_file => $bootstrap);
# Initialise Fieldmapper from the "IDL" value reported by the settings client.
62 Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
64 # must be loaded and initialized after the IDL is parsed
# NOTE(review): 'use' is executed at compile time, so the module is loaded
# before the import above regardless of where this line sits; only the
# init() call below is actually ordered after the IDL import.
65 use OpenILS::Utils::CStoreEditor;
66 OpenILS::Utils::CStoreEditor::init();
# Shared editor used for the read-only lookups further down.
68 my $e = OpenILS::Utils::CStoreEditor->new;
71 # get a list of all non-deleted records from Evergreen
72 # open-ils.cstore open-ils.cstore.direct.biblio.record_entry.id_list.atomic {"deleted":"f"}
# The request passes deleted = 'f' plus an id > 0 condition; presumably both
# are applied as filters -- confirm against the cstore search syntax.
73 $undeleted = $e->request(
74 'open-ils.cstore.direct.biblio.record_entry.id_list.atomic',
75 [{deleted => 'f'}, {id => { '>' => 0}}]
# The returned ID list replaces whatever was collected in @records.
77 @records = @$undeleted;
# ID-range mode: both bounds must be true values, so a bound of 0 disables it.
# The range is expanded into a full list of IDs in memory.
80 if ($start_id and $end_id) {
81 @records = ($start_id .. $end_id);
# days_back mode: the record IDs are read straight from the database via DBI.
84 if (defined $days_back) {
87 # Grab DB information from local settings
88 my $sc = OpenSRF::Utils::SettingsClient->new;
89 my $db_driver = $sc->config_value( reporter => setup => database => 'driver' );
90 my $db_host = $sc->config_value( reporter => setup => database => 'host' );
91 my $db_port = $sc->config_value( reporter => setup => database => 'port' );
92 my $db_name = $sc->config_value( reporter => setup => database => 'db' );
# Fallback lookup of the deprecated <name> setting.
# NOTE(review): the warning text has no trailing newline, so whatever is
# printed to STDERR next will continue on the same line.
94 $db_name = $sc->config_value( reporter => setup => database => 'name' );
95 print STDERR "WARN: <database><name> is a deprecated setting for database name. For future compatibility, you should use <database><db> instead." if $db_name;
97 my $db_user = $sc->config_value( reporter => setup => database => 'user' );
98 my $db_pw = $sc->config_value( reporter => setup => database => 'pw' );
# Every setting except the password is required.
100 die "Unable to retrieve database connection information from the settings server" unless ($db_driver && $db_host && $db_port && $db_name && $db_user);
# Build the DSN from driver, database name, host and port, then connect with
# AutoCommit, pg_enable_utf8 and RaiseError switched on.
102 my $dsn = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port;
103 my $dbh = DBI->connect($dsn,$db_user,$db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, RaiseError => 1}) or die "database connection error";
105 # SQL used to gather a list of IDs
# Selects records created today, or last edited on the calendar date exactly
# $days_back days ago (an equality test, not a range).
# NOTE(review): this does not match "created in the past few days" as the
# POD describes --days_back; confirm which behaviour is intended.
# NOTE(review): $days_back is interpolated into the SQL text. It is limited to
# an integer by the 'days_back=i' option spec, but a bound placeholder would
# be safer.
106 my $idstatement = $dbh->prepare("SELECT DISTINCT(id) AS id FROM biblio.record_entry where (date(create_date) = date(now()) or date(edit_date) = date((NOW() - '$days_back day'::interval)))");
108 # Load the list of IDs into the records array
109 $idstatement->execute();
110 while (my $ref = $idstatement->fetchrow_hashref()) {
111 my $id_ref = $ref->{"id"}; # the column name in our sql query is "id"
# Appended to @records, so IDs already in the list are kept.
112 push(@records, $id_ref);
116 # print Dumper($undeleted, \@records);
118 # Hash of controlled fields & subfields in bibliographic records, and their
119 # corresponding controlling fields & subfields in the authority record
121 # So, if the bib 650$a can be controlled by an auth 150$a, that maps to:
122 # 650 => { a => { 150 => 'a'}}
# Shape: bib tag => { bib subfield => { authority tag => authority subfield } }.
# Bib 1xx tags: controlled by the authority tag with the same number.
124 100 => { a => { 100 => 'a' },
139 110 => { a => { 110 => 'a' },
152 111 => { a => { 111 => 'a' },
167 130 => { a => { 130 => 'a' },
# Bib 6xx tags: controlled by the authority 1xx tag with the same last two digits.
182 600 => { a => { 100 => 'a' },
205 610 => { a => { 110 => 'a' },
226 611 => { a => { 111 => 'a' },
246 630 => { a => { 130 => 'a' },
265 648 => { a => { 148 => 'a' },
271 650 => { a => { 150 => 'a' },
278 651 => { a => { 151 => 'a' },
284 655 => { a => { 155 => 'a' },
# Bib 7xx tags: same last-two-digits rule.
290 700 => { a => { 100 => 'a' },
305 710 => { a => { 110 => 'a' },
318 711 => { a => { 111 => 'a' },
333 730 => { a => { 130 => 'a' },
348 751 => { a => { 151 => 'a' },
# Bib 8xx tags: same last-two-digits rule.
354 800 => { a => { 100 => 'a' },
371 830 => { a => { 130 => 'a' },
# Main loop: for each selected bib record, look at every controlled field,
# search for a matching authority record, and link it with a $0 subfield.
388 foreach my $rec_id (@records) {
391 # State variable; was the record changed?
# Fetch the bib record through the shared editor and parse its MARCXML.
395 my $record = $e->retrieve_biblio_record_entry($rec_id);
397 # print Dumper($record);
400 my $marc = MARC::Record->new_from_xml($record->marc());
402 # get the list of controlled fields
403 my @c_fields = keys %controllees;
405 foreach my $c_tag (@c_fields) {
# Subfield codes that can be controlled for this bib tag.
406 my @c_subfields = keys %{$controllees{"$c_tag"}};
407 # print "Field: $field subfields: ";
408 # foreach (@subfields) { print "$_ "; }
410 # Get the MARCXML from the record and check for controlled fields/subfields
411 my @bib_fields = ($marc->field($c_tag));
412 foreach my $bib_field (@bib_fields) {
413 # print $_->as_formatted();
# In --refresh mode, drop existing $0 links from the field before searching.
415 if ($refresh and defined(scalar($bib_field->subfield('0')))) {
416 $bib_field->delete_subfield(code => '0');
# Collect one search term per value of every controlled subfield.
423 foreach my $c_subfield (@c_subfields) {
424 my @sf_values = $bib_field->subfield($c_subfield);
426 # Give me the first element of the list of authority controlling tags for this subfield
427 # XXX Will we need to support more than one controlling tag per subfield? Probably. That
428 # will suck. Oh well, leave that up to Ole to implement.
# NOTE(review): 'keys' order is unspecified, so "first" is only well defined
# while each subfield maps to exactly one authority tag.
429 $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0];
# $match_tag is overwritten on every pass, so the value used for the search
# below is the one from the last subfield visited.
430 $match_tag = $match_subfields{$c_subfield};
431 push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values;
434 # print Dumper(\%match_subfields);
# The search is issued against a single authority tag.
437 my @tags = ($match_tag);
439 # print "Controlling tag: $c_tag and match tag $match_tag\n";
440 # print Dumper(\@tags, \@searches);
442 # Now we've built up a complete set of matching controlled
443 # subfields for this particular field; let's check to see if
444 # we have a matching authority record
# A fresh open-ils.search session is created and disconnected per bib field.
445 my $session = OpenSRF::AppSession->create("open-ils.search");
446 my $validates = $session->request("open-ils.search.authority.validate.tag.id_list",
447 "tags", \@tags, "searches", \@searches
449 $session->disconnect();
451 # print Dumper($validates);
453 # Protect against failed (error condition) search request
455 print STDERR "Search for matching authority failed; record # $rec_id\n";
459 # Only add linking if one or more was found, but we may have changed
460 # the record already if in --refresh mode.
# $validates is dereferenced as an array of authority record IDs.
461 if (scalar(@$validates) > 0) {
463 # Iterate through the returned authority record IDs to delete any
464 # matching $0 subfields already in the bib record
# A $0 matches when its value ends in ")<id>", the format written below.
465 foreach my $auth_zero (@$validates) {
466 $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/);
469 # Okay, we have a matching authority control; time to
470 # add the magical subfield 0. Use the first returned auth
# NOTE(review): @$validates[0] is a one-element slice; $validates->[0] is the
# conventional spelling for the same value.
472 my $auth_id = @$validates[0];
473 my $auth_rec = $e->retrieve_authority_record_entry($auth_id);
474 my $auth_marc = MARC::Record->new_from_xml($auth_rec->marc());
# The authority record's 003 value becomes the parenthesised prefix of the $0.
# NOTE(review): this looks like it dies if the authority record has no 003
# field -- confirm that the surrounding error handling covers it.
475 my $cni = $auth_marc->field('003')->data();
477 $bib_field->add_subfields('0' => "($cni)$auth_id");
# Saving uses a separate editor opened with a transaction (xact => 1).
483 my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1);
484 # print $marc->as_formatted();
485 my $xml = $marc->as_xml_record();
# Strip the XML declaration, whitespace between tags, and control characters,
# then entity-encode the result. The /o flag has no effect on these constant
# patterns.
487 $xml =~ s/^<\?xml.+\?\s*>//go;
488 $xml =~ s/>\s+</></go;
489 $xml =~ s/\p{Cc}//go;
490 $xml = OpenILS::Application::AppUtils->entityize($xml);
493 $editor->update_biblio_record_entry($record);
# Error path: report the failing record on STDERR.
498 print STDERR "\nRecord # $rec_id : $err\n";
# NOTE(review): indirect-object syntax; equivalent to MARC::File::XML->import.
499 import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process
507 authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen
511 C<authority_control_fields.pl> [B<--configuration>=I<opensrf_core.xml>] [B<--refresh>]
512 [[B<--record>=I<record>[ B<--record>=I<record>]]] | [B<--all>] | [B<--start_id>=I<start-ID> B<--end_id>=I<end-ID>] |
513 [B<--days_back>=I<number-of-days>]
517 For a given set of records:
521 =item * Iterate through the list of fields that are controlled fields
523 =item * Iterate through the list of subfields that are controlled for
526 =item * Search for a matching authority record for that combination of
531 =item * If we find a match, then add a $0 subfield to that field identifying
532 the controlling authority record
534 =item * If we do not find a match, then insert a row into an "uncontrolled"
535 table identifying the record ID, field, and subfield(s) that were not controlled
539 =item * Iterate through the list of floating subdivisions
543 =item * If we find a match, then add a $0 subfield to that field identifying
544 the controlling authority record
546 =item * If we do not find a match, then insert a row into an "uncontrolled"
547 table identifying the record ID, field, and subfield(s) that were not controlled
551 =item * If we changed the record, update it in the database
559 =item * B<-c> I<config-file>, B<--configuration>=I<config-file>
561 Specifies the OpenSRF configuration file used to connect to the OpenSRF router.
562 Defaults to F<@sysconfdir@/opensrf_core.xml>
564 =item * B<--record>=I<record-ID>
566 Specifies the bibliographic record ID (found in the C<biblio.record_entry.id>
567 column) of the record to process. This option may be specified more than once
568 to process multiple records in a single run.
570 =item * B<-a>, B<--all>
572 Specifies that all bibliographic records should be processed. For large
573 databases, this may take an extraordinarily long amount of time.
575 =item * B<--refresh>
577 Specifies that all authority links should be removed from the target
578 bibliographic record(s). This will effectively rewrite all authority
581 =item * B<-s> I<start-ID>, B<--start_id>=I<start-ID>
583 Specifies the starting ID of the range of bibliographic records to process.
584 This option is ignored unless it is accompanied by the B<-e> or B<--end_id>
587 =item * B<-e> I<end-ID>, B<--end_id>=I<end-ID>
589 Specifies the ending ID of the range of bibliographic records to process.
590 This option is ignored unless it is accompanied by the B<-s> or B<--start_id>
593 =item * B<--days_back>=I<number-of-days>
595 Specifies that only bibliographic records that have been created in the
596 past few days should be processed. You must specify how many days back
597 to include. This option is incompatible with the B<-s> and B<--start_id>
604 authority_control_fields.pl --start_id 1 --end_id 50000
606 Processes the bibliographic records with IDs between 1 and 50,000 using the
607 default OpenSRF configuration file for connection information.
611 Dan Scott <dscott@laurentian.ca>
613 =head1 COPYRIGHT AND LICENSE
615 Copyright 2010-2011 by Dan Scott
617 This program is free software; you can redistribute it and/or
618 modify it under the terms of the GNU General Public License
619 as published by the Free Software Foundation; either version 2
620 of the License, or (at your option) any later version.
622 This program is distributed in the hope that it will be useful,
623 but WITHOUT ANY WARRANTY; without even the implied warranty of
624 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
625 GNU General Public License for more details.
627 You should have received a copy of the GNU General Public License
628 along with this program; if not, write to the Free Software
629 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.