2 # Copyright (C) 2010-2011 Laurentian University
3 # Author: Dan Scott <dscott@laurentian.ca>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 # ---------------------------------------------------------------
21 use MARC::File::XML (BinaryEncoding => 'UTF-8');
24 use OpenILS::Utils::Fieldmapper;
25 use OpenSRF::Utils::SettingsClient;
26 use OpenSRF::EX qw/:try/;
28 use Unicode::Normalize;
29 use OpenILS::Application::AppUtils;
31 use Pod::Usage qw/ pod2usage /;
# Command-line handling: turn on MARC::Charset's assume_unicode mode, set the
# defaults, then let GetOptions fill in the option variables.
33 MARC::Charset->assume_unicode(1);
35 my ($start_id, $end_id, $refresh);
# Default OpenSRF configuration file; '@sysconfdir@' looks like a build-time
# placeholder that is substituted on install -- TODO confirm.
37 my $bootstrap = '@sysconfdir@/opensrf_core.xml';
# --configuration takes a string; --record, --start_id, --end_id and
# --days_back take integers; --refresh is a plain flag.  --record is bound to
# an array, so it may be given more than once.
41 my $result = GetOptions(
43 'configuration=s' => \$bootstrap,
44 'record=i' => \@records,
45 'refresh' => \$refresh,
47 'start_id=i' => \$start_id,
48 'end_id=i' => \$end_id,
49 'days_back=i' => \$days_back,
# Taken when option parsing failed or --help was requested.
52 if (!$result or $options{help}) {
# --start_id and --days_back are mutually exclusive.
# NOTE(review): this is a truthiness test, so a value of 0 for either option
# does not trigger the message, while the later days_back handling tests
# defined($days_back) -- confirm which behaviour is wanted.
56 if ($start_id && $days_back) {
57 print "Can't use both start ID and days back!\n";
# Connect to OpenSRF using the chosen configuration file, then load the
# Fieldmapper classes from the IDL named by the "IDL" setting.
61 OpenSRF::System->bootstrap_client(config_file => $bootstrap);
62 Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
64 # must be loaded and initialized after the IDL is parsed
# NOTE(review): 'use' is executed at compile time, so the module itself is
# loaded before the two statements above run; only the init() call below is
# actually ordered after the IDL import at run time.
65 use OpenILS::Utils::CStoreEditor;
66 OpenILS::Utils::CStoreEditor::init();
# Editor handle used further down to retrieve bibliographic and authority
# records.
68 my $e = OpenILS::Utils::CStoreEditor->new;
# Build the list of record IDs to process: either every non-deleted record,
# or an explicit start..end range.
71 # get a list of all non-deleted records from Evergreen
72 # open-ils.cstore open-ils.cstore.direct.biblio.record_entry.id_list.atomic {"deleted":"f"}
# The request passes two hashes: {deleted => 'f'} and {id => {'>' => 0}}.
# NOTE(review): the second hash is passed as a separate element rather than
# merged into the first; whether the service treats it as an additional
# filter or as query options cannot be told from here -- confirm.
# NOTE(review): $undeleted has no 'my' on this line; confirm it is declared
# earlier, otherwise it is a package global.
73 $undeleted = $e->request(
74 'open-ils.cstore.direct.biblio.record_entry.id_list.atomic',
75 [{deleted => 'f'}, {id => { '>' => 0}}]
# Dereferences the result as an array; presumably the request returns an
# array reference of IDs -- an undef result would be fatal here.
77 @records = @$undeleted;
# A range replaces the record list with every integer from start to end,
# whether or not a record with that ID exists.  Both values are tested for
# truth, so an ID of 0 disables the range.
80 if ($start_id and $end_id) {
81 @records = ($start_id .. $end_id);
# --days_back mode: collect record IDs with a direct SQL query against
# biblio.record_entry, connecting with the "reporter" database settings.
84 if (defined $days_back) {
87 # Grab DB information from local settings
88 my $sc = OpenSRF::Utils::SettingsClient->new;
89 my $db_driver = $sc->config_value( reporter => setup => database => 'driver' );
90 my $db_host = $sc->config_value( reporter => setup => database => 'host' );
91 my $db_port = $sc->config_value( reporter => setup => database => 'port' );
92 my $db_name = $sc->config_value( reporter => setup => database => 'db' );
# Legacy <name> setting: read it and, when it holds a value, warn on STDERR
# that <db> should be used instead.  The warning text has no trailing newline.
94 $db_name = $sc->config_value( reporter => setup => database => 'name' );
95 print STDERR "WARN: <database><name> is a deprecated setting for database name. For future compatibility, you should use <database><db> instead." if $db_name;
97 my $db_user = $sc->config_value( reporter => setup => database => 'user' );
98 my $db_pw = $sc->config_value( reporter => setup => database => 'pw' );
# Driver, host, port, database name and user must all be set; the password is
# not part of this check.
100 die "Unable to retrieve database connection information from the settings server" unless ($db_driver && $db_host && $db_port && $db_name && $db_user);
102 my $dsn = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port;
# RaiseError is enabled, so DBI reports failures by throwing; the trailing
# 'or die' is presumably only a fallback -- confirm.
103 my $dbh = DBI->connect($dsn,$db_user,$db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, RaiseError => 1}) or die "database connection error";
105 # SQL Used to gather a list of ID's
# $days_back is interpolated straight into the SQL text; it comes from the
# 'days_back=i' option, which Getopt::Long only accepts as an integer.
# NOTE(review): as written the WHERE clause selects records whose create_date
# is today OR whose edit_date falls on the calendar day exactly $days_back
# days ago.  It does not select everything created or edited within the last
# $days_back days -- confirm this is the intended window.
106 my $idstatement = $dbh->prepare("SELECT DISTINCT(id) AS id FROM biblio.record_entry where (date(create_date) = date(now()) or date(edit_date) = date((NOW() - '$days_back day'::interval)))");
108 # Load the list of ID's into the records array
109 $idstatement->execute();
# One row per record; each ID is appended to @records.
110 while (my $ref = $idstatement->fetchrow_hashref()) {
111 my $id_ref = $ref->{"id"}; # the column name in our sql query is "id"
112 push(@records, $id_ref);
116 # print Dumper($undeleted, \@records);
118 # Hash of controlled fields & subfields in bibliographic records, and their
119 # corresponding controlling fields & subfields in the authority record
121 # So, if the bib 650$a can be controlled by an auth 150$a, that maps to:
122 # 650 => { a => { 150 => 'a'}}
# Layout: bib tag => { bib subfield code => { authority tag => authority
# subfield code } }.  The lookup code later reads %controllees by bib tag and
# takes the first authority tag listed for each subfield.
124 100 => { a => { 100 => 'a' },
139 110 => { a => { 110 => 'a' },
152 111 => { a => { 111 => 'a' },
167 130 => { a => { 130 => 'a' },
182 600 => { a => { 100 => 'a' },
205 610 => { a => { 110 => 'a' },
226 611 => { a => { 111 => 'a' },
246 630 => { a => { 130 => 'a' },
265 648 => { a => { 148 => 'a' },
271 650 => { a => { 150 => 'a' },
278 651 => { a => { 151 => 'a' },
284 655 => { a => { 155 => 'a' },
290 700 => { a => { 100 => 'a' },
305 710 => { a => { 110 => 'a' },
318 711 => { a => { 111 => 'a' },
333 730 => { a => { 130 => 'a' },
348 751 => { a => { 151 => 'a' },
354 800 => { a => { 100 => 'a' },
371 830 => { a => { 130 => 'a' },
# Main loop: for each record ID, fetch the bibliographic record, walk every
# controlled field, ask open-ils.search for matching authority records and
# clear out $0 links that are about to be replaced.
388 foreach my $rec_id (@records) {
391 # State variable; was the record changed?
395 my $record = $e->retrieve_biblio_record_entry($rec_id);
397 # print Dumper($record);
# Parse the stored MARCXML into a MARC::Record object.
400 my $marc = MARC::Record->new_from_xml($record->marc());
402 # get the list of controlled fields
403 my @c_fields = keys %controllees;
405 foreach my $c_tag (@c_fields) {
# Subfield codes of this bib tag that can be authority-controlled.
406 my @c_subfields = keys %{$controllees{"$c_tag"}};
407 # print "Field: $field subfields: ";
408 # foreach (@subfields) { print "$_ "; }
410 # Get the MARCXML from the record and check for controlled fields/subfields
411 my @bib_fields = ($marc->field($c_tag));
412 foreach my $bib_field (@bib_fields) {
413 # print $_->as_formatted();
# In --refresh mode, drop any existing $0 link before looking for a new one.
415 if ($refresh and defined(scalar($bib_field->subfield('0')))) {
416 $bib_field->delete_subfield(code => '0');
# Collect one search term per value of every controlled subfield present in
# this field.
423 foreach my $c_subfield (@c_subfields) {
424 my @sf_values = $bib_field->subfield($c_subfield);
426 # Give me the first element of the list of authority controlling tags for this subfield
427 # XXX Will we need to support more than one controlling tag per subfield? Probably. That
428 # will suck. Oh well, leave that up to Ole to implement.
# NOTE(review): 'keys' returns hash keys in no guaranteed order, so "first"
# is only well defined while each subfield maps to a single authority tag.
429 $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0];
# $match_tag is overwritten on every pass, so after the loop it holds the
# authority tag of whichever subfield was visited last.
430 $match_tag = $match_subfields{$c_subfield};
431 push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values;
434 # print Dumper(\%match_subfields);
# Only a single authority tag is sent with the search.
437 my @tags = ($match_tag);
439 # print "Controlling tag: $c_tag and match tag $match_tag\n";
440 # print Dumper(\@tags, \@searches);
442 # Now we've built up a complete set of matching controlled
443 # subfields for this particular field; let's check to see if
444 # we have a matching authority record
# A new open-ils.search session is created and disconnected for every field
# that is checked.
445 my $session = OpenSRF::AppSession->create("open-ils.search");
446 my $validates = $session->request("open-ils.search.authority.validate.tag.id_list",
447 "tags", \@tags, "searches", \@searches
449 $session->disconnect();
451 # print Dumper($validates);
453 # Protect against failed (error condition) search request
455 print STDERR "Search for matching authority failed; record # $rec_id\n";
459 # Only add linking if one or more was found, but we may have changed
460 # the record already if in --refresh mode.
# $validates is used as an array reference of authority record IDs.
461 if (scalar(@$validates) > 0) {
463 # Iterate through the returned authority record IDs to delete any
464 # matching $0 subfields already in the bib record
465 foreach my $auth_zero (@$validates) {
# Removes $0 values that end in ")<authority id>", whatever precedes the
# closing parenthesis.
466 $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/);
469 # Okay, we have a matching authority control; time to
470 # add the magical subfield 0. Use the first returned auth
# Link step: take the first authority ID from the search result, load that
# authority record, and write a $0 subfield of the form "(<003 value>)<id>"
# into the bib field.
# $validates is an array reference, so element 0 is read with ->[0] rather
# than through a one-element array slice.
472 my $auth_id = $validates->[0];
473 my $auth_rec = $e->retrieve_authority_record_entry($auth_id);
474 my $auth_marc = MARC::Record->new_from_xml($auth_rec->marc());
# NOTE(review): nothing here guards against a missing authority record or an
# authority record without an 003 field; either case makes this a method call
# on undef -- confirm the surrounding error handling covers it.
475 my $cni = $auth_marc->field('003')->data();
477 $bib_field->add_subfields('0' => "($cni)$auth_id");
# Save step: serialise the modified MARC record back to XML, tidy it up, and
# update the bibliographic record through a transactional editor.
483 my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1);
484 # print $marc->as_formatted();
485 my $xml = $marc->as_xml_record();
# Clean-up passes, in order: strip the leading XML declaration, remove
# whitespace between adjacent tags, delete control characters (\p{Cc}), then
# run the result through AppUtils->entityize.
# The /o flag only affects patterns that interpolate variables, so it has no
# effect on these three substitutions.
487 $xml =~ s/^<\?xml.+\?\s*>//go;
488 $xml =~ s/>\s+</></go;
489 $xml =~ s/\p{Cc}//go;
490 $xml = OpenILS::Application::AppUtils->entityize($xml);
493 $editor->update_biblio_record_entry($record);
# Failure path: report the record ID and error on STDERR and carry on.
498 print STDERR "\nRecord # $rec_id : $err\n";
# NOTE(review): this is indirect-object syntax for MARC::File::XML->import.
499 import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process
507 authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen
511 C<authority_control_fields.pl> [B<--configuration>=I<opensrf_core.xml>] [B<--refresh>]
512 [[B<--record>=I<record>[ B<--record>=I<record>]]] | [B<--all>] | [B<--start_id>=I<start-ID> B<--end_id>=I<end-ID>] | [B<--days_back>=I<number-of-days>]
516 For a given set of records:
520 =item * Iterate through the list of fields that are controlled fields
522 =item * Iterate through the list of subfields that are controlled for
525 =item * Search for a matching authority record for that combination of
530 =item * If we find a match, then add a $0 subfield to that field identifying
531 the controlling authority record
533 =item * If we do not find a match, then insert a row into an "uncontrolled"
534 table identifying the record ID, field, and subfield(s) that were not controlled
538 =item * Iterate through the list of floating subdivisions
542 =item * If we find a match, then add a $0 subfield to that field identifying
543 the controlling authority record
545 =item * If we do not find a match, then insert a row into an "uncontrolled"
546 table identifying the record ID, field, and subfield(s) that were not controlled
550 =item * If we changed the record, update it in the database
558 =item * B<-c> I<config-file>, B<--configuration>=I<config-file>
560 Specifies the OpenSRF configuration file used to connect to the OpenSRF router.
561 Defaults to F<@sysconfdir@/opensrf_core.xml>
563 =item * B<-r> I<record-ID>, B<--record>=I<record-ID>
565 Specifies the bibliographic record ID (found in the C<biblio.record_entry.id>
566 column) of the record to process. This option may be specified more than once
567 to process multiple records in a single run.
569 =item * B<-a>, B<--all>
571 Specifies that all bibliographic records should be processed. For large
572 databases, this may take an extraordinarily long amount of time.
574 =item * B<-r>, B<--refresh>
576 Specifies that all authority links should be removed from the target
577 bibliographic record(s). This will effectively rewrite all authority
580 =item * B<-s> I<start-ID>, B<--start_id>=I<start-ID>
582 Specifies the starting ID of the range of bibliographic records to process.
583 This option is ignored unless it is accompanied by the B<-e> or B<--end_id>
586 =item * B<-e> I<end-ID>, B<--end_id>=I<end-ID>
588 Specifies the ending ID of the range of bibliographic records to process.
589 This option is ignored unless it is accompanied by the B<-s> or B<--start_id>
592 =item * B<--days_back>=I<number-of-days>
594 Specifies that only bibliographic records that have been created in the
595 past few days should be processed. You must specify how many days back
596 to include. This option is incompatible with the B<-s> and B<--start_id>
603 authority_control_fields.pl --start_id 1 --end_id 50000
605 Processes the bibliographic records with IDs between 1 and 50,000 using the
606 default OpenSRF configuration file for connection information.
610 Dan Scott <dscott@laurentian.ca>
612 =head1 COPYRIGHT AND LICENSE
614 Copyright 2010-2011 by Dan Scott
616 This program is free software; you can redistribute it and/or
617 modify it under the terms of the GNU General Public License
618 as published by the Free Software Foundation; either version 2
619 of the License, or (at your option) any later version.
621 This program is distributed in the hope that it will be useful,
622 but WITHOUT ANY WARRANTY; without even the implied warranty of
623 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
624 GNU General Public License for more details.
626 You should have received a copy of the GNU General Public License
627 along with this program; if not, write to the Free Software
628 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.