removing control characters from XML with reckless abandon
[Evergreen.git] / Open-ILS / src / extras / import / marc2are.pl
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5 use lib '/openils/lib/perl5/';
6
7 use OpenSRF::System;
8 use OpenSRF::Application;
9 use OpenSRF::EX qw/:try/;
10 use OpenSRF::AppSession;
11 use OpenSRF::MultiSession;
12 use OpenSRF::Utils::SettingsClient;
13 use OpenILS::Application::AppUtils;
14 use OpenILS::Utils::Fieldmapper;
15 use Digest::MD5 qw/md5_hex/;
16 use OpenSRF::Utils::JSON;
17 use Data::Dumper;
18 use Unicode::Normalize;
19
20 use Time::HiRes qw/time/;
21 use Getopt::Long;
22 use MARC::Batch;
23 use MARC::File::XML ( BinaryEncoding => 'utf-8' );
24 use MARC::Charset;
25
# Script body: read MARC authority records from the input files and print one
# JSON-serialized Fieldmapper::authority::record_entry per record on STDOUT.
# Progress (last id handed out and records per second) goes to STDERR unless
# --quiet is given.

# Turn on MARC::Charset's ignore_errors flag before any record is converted.
MARC::Charset->ignore_errors(1);

# Defaults, each overridable from the command line.
my $count    = 1;                                   # --startid: id given to the first record
my $user     = 'admin';                             # --user
my $password = 'open-ils';                          # --password
my $config   = '/openils/conf/opensrf_core.xml';    # --config
my $marctype = 'USMARC';                            # --marctype: format name handed to MARC::Batch
my @files;                                          # --file (repeatable)
my $quiet;                                          # --quiet: suppress the progress display

GetOptions(
        'startid=i'     => \$count,
        'user=s'        => \$user,
        'marctype=s'    => \$marctype,
        'password=s'    => \$password,
        'config=s'      => \$config,
        'file=s'        => \@files,
        'quiet'         => \$quiet,
);

# With no --file options, treat the remaining arguments as input files.
@files = @ARGV if (!@files);

OpenSRF::System->bootstrap_client( config_file => $config );
Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));

# From here on $user holds the id of the logged-in user, not the login name.
$user = OpenILS::Application::AppUtils->check_user_session( login($user,$password) )->id;

# Unbuffer both output streams; STDOUT is left as the selected handle.
select STDERR; $| = 1;
select STDOUT; $| = 1;

my $batch = MARC::Batch->new( $marctype, @files );
$batch->strict_off();
$batch->warnings_off();

my $starttime = time;
my $processed = 0;      # records emitted so far; $count may start anywhere
my $rec;
while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) {
        # -1 is the "reading this record threw" sentinel set above; skip it.
        next if (!ref $rec);
        my $id = $count;

        # Flatten the record to a single line of XML without a declaration.
        (my $xml = $rec->as_xml_record()) =~ s/\n//sog;
        # Non-greedy so only the leading <?xml ...?> declaration is removed.
        $xml =~ s/^<\?xml.+?\?\s*>//o;
        $xml =~ s/>\s+</></go;
        # Control characters are stripped both before and after entityize().
        $xml =~ s/\p{Cc}//go;
        $xml = entityize($xml);
        $xml =~ s/[\x00-\x1f]//go;

        my $auth_rec = Fieldmapper::authority::record_entry->new;
        $auth_rec->id($id);
        $auth_rec->active('t');
        $auth_rec->deleted('f');
        $auth_rec->marc($xml);
        $auth_rec->creator($user);
        $auth_rec->create_date('now');
        $auth_rec->editor($user);
        $auth_rec->edit_date('now');
        $auth_rec->arn_source('LEGACY');
        $auth_rec->arn_value($id);
        $auth_rec->last_xact_id('IMPORT-'.$starttime);

        print OpenSRF::Utils::JSON->perl2JSON($auth_rec)."\n";

        $count++;
        $processed++;

        if (!$quiet && !($count % 20)) {
                # Rate is based on records actually processed, not on the id
                # counter, and is reported as 0 if no time has elapsed yet.
                my $elapsed = time - $starttime;
                my $rate = $elapsed > 0 ? $processed / $elapsed : 0;
                print STDERR "\r$count\t". $rate;
        }
}
93
# Log in through the open-ils.auth service and return the auth token.
#
# Arguments:
#   $username - login name sent to authenticate.init and authenticate.complete
#   $password - clear-text password; only md5_hex($seed . md5_hex($password))
#               is sent to the service
#   $type     - optional login type; "staff" is used when none is given
#
# Returns the authtoken found in the response payload.
# Dies when no seed or no response comes back, or when the payload carries
# no authtoken.
sub login {
        my( $username, $password, $type ) = @_;

        # Logical default. The former "|=" was a bitwise string OR, which only
        # yielded "staff" for an empty $type and mangled any supplied value.
        $type ||= "staff";

        my $seed = OpenILS::Application::AppUtils->simplereq(
                'open-ils.auth',
                'open-ils.auth.authenticate.init',
                $username
        );

        die("No auth seed. Couldn't talk to the auth server") unless $seed;

        my $response = OpenILS::Application::AppUtils->simplereq(
                'open-ils.auth',
                'open-ils.auth.authenticate.complete',
                {       username => $username,
                        password => md5_hex($seed . md5_hex($password)),
                        type => $type });

        die("No auth response returned on login.") unless $response;

        my $authtoken = $response->{payload}->{authtoken};

        die("Login failed for user $username!") unless $authtoken;

        return $authtoken;
}
123
# Normalize a string and replace every character in the range U+0080..U+FFFD
# with a hexadecimal numeric character reference (&#xHHHH;).
#
# Arguments (positional):
#   1. the text to convert
#   2. optional normalization form: 'D' selects NFD; any other value, or
#      none at all, selects NFC
#
# Returns the converted copy. Characters outside that range are left as-is.
sub entityize {
        my ($text, $mode) = @_;

        my $want_decomposed = ($mode and $mode eq 'D');
        my $normalized = $want_decomposed ? NFD($text) : NFC($text);

        $normalized =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe;

        return $normalized;
}
137