2 # Copyright (C) 2010-2011 Laurentian University
3 # Author: Dan Scott <dscott@laurentian.ca>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 # ---------------------------------------------------------------
21 use MARC::File::XML (BinaryEncoding => 'UTF-8');
24 use OpenILS::Utils::Fieldmapper;
25 use OpenSRF::Utils::SettingsClient;
26 use OpenSRF::EX qw/:try/;
28 use Unicode::Normalize;
29 use OpenILS::Application::AppUtils;
31 use Pod::Usage qw/ pod2usage /;
# Command-line handling: declare the option variables, parse the command line
# with GetOptions, and reject the combination of a start ID with --days_back.
# NOTE(review): the embedded line numbers in this listing jump (e.g. 37 -> 42,
# 50 -> 53), so some declarations, the closing of the GetOptions call, and the
# bodies of the two "if" blocks are not visible here.
33 MARC::Charset->assume_unicode(1);
35 my ($start_id, $end_id, $refresh);
# '@sysconfdir@' looks like a build-time substitution placeholder -- TODO confirm.
37 my $bootstrap = '@sysconfdir@/opensrf_core.xml';
# Presumably Getopt::Long's GetOptions (its "use" line is not visible here):
# "=s" takes a string value, "=i" an integer value, and a bare name is a flag.
# NOTE(review): 'record' and 'refresh' share the prefix "r", so a bare -r would
# be ambiguous under Getopt::Long's default auto-abbreviation, although the POD
# documents -r for both -- confirm against any aliases declared in missing lines.
42 my $result = GetOptions(
44 'configuration=s' => \$bootstrap,
# Bound to an array, so --record may be given more than once.
45 'record=i' => \@records,
46 'refresh' => \$refresh,
48 'start_id=i' => \$start_id,
49 'end_id=i' => \$end_id,
50 'days_back=i' => \$days_back,
# Taken when option parsing failed or the help entry of %options is true.
53 if (!$result or $options{help}) {
# Truthiness test: a value of 0 for either option does not trigger this check.
57 if ($start_id && $days_back) {
58 print "Can't use both start ID and days back!\n";
# Connect to OpenSRF using the configuration file named by $bootstrap, then
# import the Fieldmapper with the IDL location read from the settings client.
62 OpenSRF::System->bootstrap_client(config_file => $bootstrap);
63 Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
65 # must be loaded and initialized after the IDL is parsed
# NOTE(review): "use" is executed at compile time, so its position here does
# not delay loading of the module; only the init() call below is actually
# ordered after the Fieldmapper import at run time.
66 use OpenILS::Utils::CStoreEditor;
67 OpenILS::Utils::CStoreEditor::init();
# Editor used later in the script to retrieve bibliographic and authority records.
68 my $e = OpenILS::Utils::CStoreEditor->new;
71 # Grab DB information from local settings
# All values are read from the reporter -> setup -> database section of the
# settings, and a DBI handle ($dbh) is opened from them.
72 my $sc = OpenSRF::Utils::SettingsClient->new;
73 my $db_driver = $sc->config_value( reporter => setup => database => 'driver' );
74 my $db_host = $sc->config_value( reporter => setup => database => 'host' );
75 my $db_port = $sc->config_value( reporter => setup => database => 'port' );
76 my $db_name = $sc->config_value( reporter => setup => database => 'db' );
# Fallback to the deprecated <name> setting.
# NOTE(review): listing lines 77 and 80 are absent; as shown, this assignment is
# unconditional and would overwrite a value read from <db>. Presumably it is
# guarded by a check that $db_name is empty -- confirm.
78 $db_name = $sc->config_value( reporter => setup => database => 'name' );
# NOTE(review): the warning text has no trailing newline.
79 print STDERR "WARN: <database><name> is a deprecated setting for database name. For future compatibility, you should use <database><db> instead." if $db_name;
81 my $db_user = $sc->config_value( reporter => setup => database => 'user' );
82 my $db_pw = $sc->config_value( reporter => setup => database => 'pw' );
# The password is deliberately not part of this check; every other value is required.
83 die "Unable to retrieve database connection information from the settings server" unless ($db_driver && $db_host && $db_port && $db_name && $db_user);
# DSN has the form dbi:<driver>:dbname=<name>;host=<host>;port=<port>
84 my $dsn = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port;
# NOTE(review): with RaiseError => 1 a failed connect presumably dies inside
# DBI itself, making the trailing "or die" a fallback only -- confirm.
85 my $dbh = DBI->connect($dsn,$db_user,$db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, RaiseError => 1}) or die "database connection error";
# Build the list of record IDs (@records) to process, from one of three
# sources: a query for every non-deleted record, a start/end ID range, or a
# query limited by edit date.
# NOTE(review): the condition guarding the first query and the closing braces
# of these blocks are not visible in this listing.
90 # SQL used to gather a list of IDs
# Selects the id of every non-deleted biblio.record_entry row, highest id first.
91 $idstatement = $dbh->prepare("SELECT DISTINCT(id) AS id FROM biblio.record_entry
92 WHERE deleted IS FALSE ORDER BY ID DESC");
94 # Load the list of IDs into the records array
95 $idstatement->execute();
96 while (my $ref = $idstatement->fetchrow_hashref()) {
97 my $id_ref = $ref->{"id"}; # the column name in our sql query is "id"
98 push(@records, $id_ref);
# Both values must be true (non-zero); the range replaces any IDs already in
# @records rather than adding to them, and is not checked against the database.
102 if ($start_id and $end_id) {
103 @records = ($start_id .. $end_id);
# Uses defined(), so a value of 0 still selects this branch.
106 if (defined $days_back) {
# NOTE(review): $days_back is interpolated directly into the SQL text below.
# The 'days_back=i' option spec presumably restricts it to an integer, but a
# bound placeholder would be more robust -- confirm. The filter is on edit_date.
109 # SQL used to gather a list of IDs
110 $idstatement = $dbh->prepare("SELECT DISTINCT(id) AS id FROM biblio.record_entry
111 WHERE deleted IS FALSE AND date(edit_date) >= date((NOW() - '$days_back day'::interval))
114 # Load the list of ID's into the records array
115 $idstatement->execute();
116 while (my $ref = $idstatement->fetchrow_hashref()) {
117 my $id_ref = $ref->{"id"}; # the column name in our sql query is "id"
118 push(@records, $id_ref);
122 # print Dumper($undeleted, \@records);
124 # Hash of controlled fields & subfields in bibliographic records, and their
125 # corresponding controlling fields & subfields in the authority record
127 # So, if the bib 650$a can be controlled by an auth 150$a, that maps to:
128 # 650 => { a => { 150 => 'a'}}
# Structure: bib tag => { bib subfield => { authority tag => authority subfield } }.
# In every entry visible here the authority tag is "1" followed by the last two
# digits of the bib tag (e.g. 600, 700 and 800 all map to 100).
# NOTE(review): only the first ($a) line of each entry is visible in this
# listing; the opening declaration, the remaining subfields and the closing
# delimiters fall on lines that are not shown.
130 100 => { a => { 100 => 'a' },
145 110 => { a => { 110 => 'a' },
158 111 => { a => { 111 => 'a' },
173 130 => { a => { 130 => 'a' },
188 600 => { a => { 100 => 'a' },
211 610 => { a => { 110 => 'a' },
232 611 => { a => { 111 => 'a' },
252 630 => { a => { 130 => 'a' },
271 648 => { a => { 148 => 'a' },
277 650 => { a => { 150 => 'a' },
284 651 => { a => { 151 => 'a' },
290 655 => { a => { 155 => 'a' },
296 700 => { a => { 100 => 'a' },
311 710 => { a => { 110 => 'a' },
324 711 => { a => { 111 => 'a' },
339 730 => { a => { 130 => 'a' },
354 751 => { a => { 151 => 'a' },
360 800 => { a => { 100 => 'a' },
377 830 => { a => { 130 => 'a' },
# Main loop: for each record ID, load the bibliographic record, walk every
# controlled field in its MARC, search for matching authority records, add a
# $0 subfield pointing at the first match, and write the record back.
# NOTE(review): many lines of this loop (declarations, closing braces, the
# error-handling wrapper) are not visible in this listing; comments below
# describe only what the visible lines do.
# Array in scalar context: the number of records. Among the visible lines it
# is referenced only by the commented-out progress message below.
394 my $rec_count = @records;
396 foreach my $rec_id (@records) {
398 #print "record: $rec_id $i of $rec_count\n";
400 # State variable; was the record changed?
404 my $record = $e->retrieve_biblio_record_entry($rec_id);
406 # print Dumper($record);
# Parse the record's MARCXML into a MARC::Record object.
409 my $marc = MARC::Record->new_from_xml($record->marc());
411 # get the list of controlled fields
412 my @c_fields = keys %controllees;
414 foreach my $c_tag (@c_fields) {
# Subfield codes that are controlled for this bib tag.
415 my @c_subfields = keys %{$controllees{"$c_tag"}};
416 # print "Field: $field subfields: ";
417 # foreach (@subfields) { print "$_ "; }
419 # Get the MARCXML from the record and check for controlled fields/subfields
420 my @bib_fields = ($marc->field($c_tag));
421 foreach my $bib_field (@bib_fields) {
422 # print $_->as_formatted();
# In --refresh mode, drop the field's existing $0 subfields before re-linking.
424 if ($refresh and defined(scalar($bib_field->subfield('0')))) {
425 $bib_field->delete_subfield(code => '0');
# Collect one search term per value of each controlled subfield in this field.
432 foreach my $c_subfield (@c_subfields) {
433 my @sf_values = $bib_field->subfield($c_subfield);
435 # Give me the first element of the list of authority controlling tags for this subfield
436 # XXX Will we need to support more than one controlling tag per subfield? Probably. That
437 # will suck. Oh well, leave that up to Ole to implement.
# Hash key order is unspecified, so "first" is only well-defined while each
# subfield has a single controlling tag.
438 $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0];
# Overwritten on every iteration: the tag of the last subfield processed wins.
439 $match_tag = $match_subfields{$c_subfield};
440 push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values;
443 # print Dumper(\%match_subfields);
446 my @tags = ($match_tag);
448 # print "Controlling tag: $c_tag and match tag $match_tag\n";
449 # print Dumper(\@tags, \@searches);
451 # Now we've built up a complete set of matching controlled
452 # subfields for this particular field; let's check to see if
453 # we have a matching authority record
454 my $session = OpenSRF::AppSession->create("open-ils.search");
# NOTE(review): $validates is dereferenced as an array ref further down; how
# the request result is turned into that array ref is on lines not visible here.
455 my $validates = $session->request("open-ils.search.authority.validate.tag.id_list",
456 "tags", \@tags, "searches", \@searches
458 $session->disconnect();
460 # print Dumper($validates);
462 # Protect against failed (error condition) search request
464 print STDERR "Search for matching authority failed; record # $rec_id\n";
468 # Only add linking if one or more was found, but we may have changed
469 # the record already if in --refresh mode.
470 if (scalar(@$validates) > 0) {
472 # Iterate through the returned authority record IDs to delete any
473 # matching $0 subfields already in the bib record
474 foreach my $auth_zero (@$validates) {
# Matches $0 values that end in ")<authority id>", the format written below.
475 $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/);
478 # Okay, we have a matching authority control; time to
479 # add the magical subfield 0. Use the first returned auth
# NOTE(review): @$validates[0] is a one-element array slice; $validates->[0]
# is the conventional way to read a single element.
481 my $auth_id = @$validates[0];
482 my $auth_rec = $e->retrieve_authority_record_entry($auth_id);
483 my $auth_marc = MARC::Record->new_from_xml($auth_rec->marc());
# NOTE(review): if the authority record has no 003 field, field('003') would
# presumably return undef and ->data() would die -- confirm that the
# surrounding error handler covers this.
484 my $cni = $auth_marc->field('003')->data();
# The link has the form "(<003 value>)<authority record id>".
486 $bib_field->add_subfields('0' => "($cni)$auth_id");
# Separate editor opened with xact => 1 for the update.
492 my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1);
493 # print $marc->as_formatted();
494 my $xml = $marc->as_xml_record();
# Clean up the serialized XML: strip the XML declaration, remove whitespace
# between tags, and delete control characters, then pass it through entityize().
# The /o flags have no effect because none of the patterns interpolate variables.
496 $xml =~ s/^<\?xml.+\?\s*>//go;
497 $xml =~ s/>\s+</></go;
498 $xml =~ s/\p{Cc}//go;
499 $xml = OpenILS::Application::AppUtils->entityize($xml);
# NOTE(review): the line that stores $xml back into $record is not visible here.
502 $editor->update_biblio_record_entry($record);
507 print STDERR "\nRecord # $rec_id : $err\n";
# Indirect-object syntax; equivalent to MARC::File::XML->import.
508 import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process
516 authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen
520 C<authority_control_fields.pl> [B<--configuration>=I<opensrf_core.xml>] [B<--refresh>]
521 [[B<--record>=I<record>[ B<--record>=I<record>]]] | [B<--all>] | [B<--start_id>=I<start-ID> B<--end_id>=I<end-ID>] |
522 [B<--days_back>=I<number-of-days>]
526 For a given set of records:
530 =item * Iterate through the list of fields that are controlled fields
532 =item * Iterate through the list of subfields that are controlled for
535 =item * Search for a matching authority record for that combination of
540 =item * If we find a match, then add a $0 subfield to that field identifying
541 the controlling authority record
543 =item * If we do not find a match, then insert a row into an "uncontrolled"
544 table identifying the record ID, field, and subfield(s) that were not controlled
548 =item * Iterate through the list of floating subdivisions
552 =item * If we find a match, then add a $0 subfield to that field identifying
553 the controlling authority record
555 =item * If we do not find a match, then insert a row into an "uncontrolled"
556 table identifying the record ID, field, and subfield(s) that were not controlled
560 =item * If we changed the record, update it in the database
568 =item * B<-c> I<config-file>, B<--configuration>=I<config-file>
570 Specifies the OpenSRF configuration file used to connect to the OpenSRF router.
571 Defaults to F<@sysconfdir@/opensrf_core.xml>
573 =item * B<-r> I<record-ID>, B<--record>=I<record-ID>
575 Specifies the bibliographic record ID (found in the C<biblio.record_entry.id>
576 column) of the record to process. This option may be specified more than once
577 to process multiple records in a single run.
579 =item * B<-a>, B<--all>
581 Specifies that all bibliographic records should be processed. For large
582 databases, this may take an extraordinarily long amount of time.
584 =item * B<-r>, B<--refresh>
586 Specifies that all authority links should be removed from the target
587 bibliographic record(s). This will effectively rewrite all authority
590 =item * B<-s> I<start-ID>, B<--start_id>=I<start-ID>
592 Specifies the starting ID of the range of bibliographic records to process.
593 This option is ignored unless it is accompanied by the B<-e> or B<--end_id>
596 =item * B<-e> I<end-ID>, B<--end_id>=I<end-ID>
598 Specifies the ending ID of the range of bibliographic records to process.
599 This option is ignored unless it is accompanied by the B<-s> or B<--start_id>
602 =item * B<--days_back>=I<number-of-days>
604 Specifies that only bibliographic records whose C<edit_date> falls within the
605 past few days should be processed. You must specify how many days back
606 to include. This option is incompatible with the B<-s> or B<--start_id>
613 authority_control_fields.pl --start_id 1 --end_id 50000
615 Processes the bibliographic records with IDs between 1 and 50,000 using the
616 default OpenSRF configuration file for connection information.
620 Dan Scott <dscott@laurentian.ca>
622 =head1 COPYRIGHT AND LICENSE
624 Copyright 2010-2011 by Dan Scott
626 This program is free software; you can redistribute it and/or
627 modify it under the terms of the GNU General Public License
628 as published by the Free Software Foundation; either version 2
629 of the License, or (at your option) any later version.
631 This program is distributed in the hope that it will be useful,
632 but WITHOUT ANY WARRANTY; without even the implied warranty of
633 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
634 GNU General Public License for more details.
636 You should have received a copy of the GNU General Public License
637 along with this program; if not, write to the Free Software
638 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.