From ddecb4bd099f2306935fe481cdab38223f864283 Mon Sep 17 00:00:00 2001
From: erickson
Date: Sun, 27 Feb 2005 16:08:56 +0000
Subject: [PATCH] the marcdumper! takes command line params

git-svn-id: svn://svn.open-ils.org/OpenSRF/trunk@121 9efc2488-bf62-4759-914b-345cdb29e865
---
 src/extras/marcdumper/Makefile     |  17 ++
 src/extras/marcdumper/marcdumper.c | 403 +++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+)
 create mode 100644 src/extras/marcdumper/Makefile
 create mode 100644 src/extras/marcdumper/marcdumper.c

diff --git a/src/extras/marcdumper/Makefile b/src/extras/marcdumper/Makefile
new file mode 100644
index 0000000..c1158be
--- /dev/null
+++ b/src/extras/marcdumper/Makefile
@@ -0,0 +1,17 @@
+#
+# This utility code requires libxml2 and yaz version 2.0.34 or greater.
+#
+
+CC = gcc
+LD = -lxml2 -lyaz
+COMP = -O2 -I /usr/include/libxml2/
+
+all: marcdumper
+
+marcdumper: marcdumper.c
+	$(CC) marcdumper.c $(COMP) $(LD) -o $@
+
+clean:
+	/bin/rm -f *.o marcdumper
+
+
diff --git a/src/extras/marcdumper/marcdumper.c b/src/extras/marcdumper/marcdumper.c
new file mode 100644
index 0000000..90adf62
--- /dev/null
+++ b/src/extras/marcdumper/marcdumper.c
@@ -0,0 +1,403 @@
+/*
+* Copyright (C) 1995-2005, Index Data ApS
+* See the file LICENSE for details.
+*
+* $Id$
+*/
+
+/*
+* marcdumper: reads MARC records from the files named on the command line,
+* decodes each one with yaz_marc_decode_buf() and writes the result to
+* stdout.  With -r, the decoded record is parsed as XML and every node
+* matching the given XPath expression is removed before it is printed.
+* With -c, the raw bytes of each record are also written to a file as a
+* C array of string literals.
+*/
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/*
+* NOTE(review): the header names were missing from the copy of this patch
+* that was reviewed; the names below were restored from the symbols the
+* file uses.  Confirm them against the repository.
+*/
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#if HAVE_LOCALE_H
+#include <locale.h>
+#endif
+#if HAVE_LANGINFO_H
+#include <langinfo.h>
+#endif
+
+#include <yaz/marcdisp.h>
+#include <yaz/yaz-util.h>
+#include <yaz/xmalloc.h>
+#include <yaz/options.h>
+
+#ifndef SEEK_SET
+#define SEEK_SET 0
+#endif
+#ifndef SEEK_END
+#define SEEK_END 2
+#endif
+
+#include <fcntl.h>
+
+/* Record framing: each record starts with a 5-digit decimal length, and
+* main() only reads records whose length lies in [25, 100000]. */
+enum {
+	MD_LEN_DIGITS = 5,
+	MD_MIN_RECORD = 25,
+	MD_MAX_RECORD = 100000
+};
+
+char* clean_marc_xpath = "//*[@tag=\"999\"]";
+char* holdings_xpath = "/*/*[(local-name()='datafield' and "
+	"(@tag!='035' and @tag!='999')) or local-name()!='datafield']";
+
+void prune_doc( xmlDocPtr doc, char* xpath );
+char* _xml_to_string( xmlDocPtr doc );
+
+/* Prints the command line synopsis to stderr. */
+static void usage(const char *prog) {
+	fprintf (stderr, "Usage: %s -r [xpath] -c [cfile] [-f from] [-t to] [-x] [-O] [-X] [-I] [-v] file...\n", prog);
+}
+
+/*
+* Writes one raw record to cfile as a C string literal made of \xNN
+* escapes, 16 bytes per output line.  A separating comma is written first
+* unless this is the first record of the array.
+*/
+static void write_c_record(FILE *cfile, const char *rec, int len, int first) {
+	int i;
+
+	if (!first)
+		fprintf (cfile, ",");
+	fprintf (cfile, "\n");
+	for (i = 0; i < len; i++) {
+		if ((i & 15) == 0)
+			fprintf (cfile, " \"");
+		fprintf (cfile, "\\x%02X", (unsigned) (rec[i] & 255));
+		if (i < len - 1 && (i & 15) == 15)
+			fprintf (cfile, "\"\n");
+	}
+	fprintf (cfile, "\"\n");
+}
+
+/*
+* Parses the options, then decodes and prints every record of each input
+* file as it is encountered on the command line.  Exits with status 1 on
+* usage or file errors and 2 when the requested character set conversion
+* is not supported.
+*/
+int main (int argc, char **argv) {
+	int counter = 0;	/* records decoded so far, over all input files */
+	int r;
+	int print_offset = 0;
+	char *arg;
+	int verbose = 0;
+	FILE *inf;
+	char buf[MD_MAX_RECORD + 1];
+	char *prog = *argv;
+	int no = 0;
+	int xml = 0;
+	FILE *cfile = 0;
+	char *from = 0, *to = 0;
+	int num = 1;
+	char *prune = NULL;
+
+#if HAVE_LOCALE_H
+	setlocale(LC_CTYPE, "");
+#endif
+#if HAVE_LANGINFO_H
+#ifdef CODESET
+	to = nl_langinfo(CODESET);
+#endif
+#endif
+
+	/* Options that take an argument are followed by ':' in the option
+	 * string; 'c' needs one as well because its handler opens arg. */
+	while ((r = options("pvc:r:xOeXIf:t:2", argv, argc, &arg)) != -2) {
+
+		int count;	/* records written to cfile for the current input file */
+		no++;
+
+		switch (r) {
+		case 'r':
+			prune = arg;
+			xmlKeepBlanksDefault(0);
+			break;
+		case 'f':
+			from = arg;
+			break;
+		case 't':
+			to = arg;
+			break;
+		case 'c':
+			if (cfile)
+				fclose (cfile);
+			cfile = fopen (arg, "w");
+			if (!cfile) {
+				fprintf (stderr, "%s: cannot open %s:%s\n",
+					prog, arg, strerror (errno));
+				exit(1);
+			}
+			break;
+		case 'x':
+			xml = YAZ_MARC_SIMPLEXML;
+			break;
+		case 'O':
+			xml = YAZ_MARC_OAIMARC;
+			break;
+		case 'e': /* not supported on older versions of yaz */
+			xml = YAZ_MARC_XCHANGE;
+			break;
+		case 'X':
+			xml = YAZ_MARC_MARCXML;
+			break;
+		case 'I':
+			xml = YAZ_MARC_ISO2709;
+			break;
+		case 'p':
+			print_offset = 1;
+			break;
+		case '2':
+			/* accepted, but nothing in this program reads the setting */
+			break;
+		case 0:
+			inf = fopen (arg, "rb");
+			count = 0;
+			if (!inf) {
+				fprintf (stderr, "%s: cannot open %s:%s\n",
+					prog, arg, strerror (errno));
+				exit(1);
+			}
+			if (cfile)
+				fprintf (cfile, "char *marc_records[] = {\n");
+
+			{
+				yaz_marc_t mt = yaz_marc_create();
+				yaz_iconv_t cd = 0;
+
+				if (from && to) {
+					cd = yaz_iconv_open(to, from);
+					if (!cd) {
+						fprintf(stderr, "conversion from %s to %s " "unsupported\n", from, to);
+						exit(2);
+					}
+					yaz_marc_iconv(mt, cd);
+				}
+				yaz_marc_xml(mt, xml);
+				yaz_marc_debug(mt, verbose);
+
+				while (1) {
+					int len;
+					char *result;
+					int rlen;
+
+					r = (int) fread (buf, 1, MD_LEN_DIGITS, inf);
+
+					if (r < MD_LEN_DIGITS) {
+						if (r && print_offset)
+							printf ("Extra %d bytes", r);
+						break;
+					}
+
+					if (print_offset) {
+						long off = ftell(inf);
+						printf ("Record %d offset %ld\n", num, off);
+					}
+
+					len = atoi_n(buf, MD_LEN_DIGITS);
+
+					if (len < MD_MIN_RECORD || len > MD_MAX_RECORD) break;
+
+					len = len - MD_LEN_DIGITS;
+					r = (int) fread (buf + MD_LEN_DIGITS, 1, len, inf);
+
+					if (r < len) break;
+
+					r = yaz_marc_decode_buf (mt, buf, -1, &result, &rlen);
+
+					if (r <= 0) break;
+
+					/* count only records that were actually decoded */
+					counter++;
+
+					if (!prune) {
+
+						fwrite (result, rlen, 1, stdout);
+
+					} else {
+
+						char *marc;
+						xmlDocPtr doc = xmlParseMemory(result, rlen);
+
+						if (!doc) {
+							fprintf(stderr, "xmLParseMemory failed\n");
+							continue;
+						}
+
+						prune_doc( doc, prune );
+
+						marc = _xml_to_string(doc);
+						if (marc) {
+							fprintf(stdout, "%s", marc);
+							free(marc);
+						}
+						xmlFreeDoc(doc);
+					}
+
+					if (cfile) {
+						write_c_record(cfile, buf, r, count == 0);
+						/* advanced per record so that later records
+						 * are separated by commas */
+						count++;
+					}
+					num++;
+				}
+
+				if (cd)
+					yaz_iconv_close(cd);
+				yaz_marc_destroy(mt);
+			}
+
+			if (cfile)
+				fprintf (cfile, "};\n");
+			fclose(inf);
+			break;
+		case 'v':
+			verbose++;
+			break;
+		default:
+			usage(prog);
+			exit (1);
+		}
+	}
+
+	if (cfile)
+		fclose (cfile);
+	if (!no) {
+		usage(prog);
+		exit (1);
+	}
+
+	fprintf(stderr, "\nProcessed %d Records\n", counter);
+	exit (0);
+}
+
+/*
+* Removes from doc every node selected by the XPath expression xpath, and
+* then every comment and processing instruction that is a direct child of
+* the document.  Does nothing when the expression cannot be evaluated.
+*/
+void prune_doc( xmlDocPtr doc, char* xpath ) {
+
+	xmlXPathContextPtr xpathctx;
+	xmlXPathObjectPtr object;
+	xmlNodePtr cur;
+
+	if(doc == NULL || xpath == NULL) return;
+
+	xpathctx = xmlXPathNewContext(doc);
+	if(xpathctx == NULL) {
+		fprintf(stderr, "XPATH FAILED");
+		return;
+	}
+
+	object = xmlXPathEvalExpression( BAD_CAST xpath, xpathctx);
+	if(object == NULL) {
+		xmlXPathFreeContext(xpathctx);
+		return;
+	}
+
+	/* nodesetval may be NULL, e.g. when the result is not a node set */
+	if(object->nodesetval != NULL) {
+		int i;
+
+		/* Walk backwards so that, with the set in document order,
+		 * nested matches are freed before the nodes containing them. */
+		for(i = object->nodesetval->nodeNr - 1; i >= 0; i-- ) {
+			xmlNodePtr cur_node = object->nodesetval->nodeTab[i];
+
+			/* namespace and document entries are not ordinary tree
+			 * nodes and must not go through xmlFreeNode() */
+			if(cur_node == NULL
+				|| cur_node->type == XML_NAMESPACE_DECL
+				|| cur_node->type == XML_DOCUMENT_NODE
+				|| cur_node->type == XML_HTML_DOCUMENT_NODE)
+				continue;
+
+			/* clear the slot: xmlXPathFreeObject() below must not
+			 * look at a node that has already been freed */
+			object->nodesetval->nodeTab[i] = NULL;
+			xmlUnlinkNode( cur_node );
+			xmlFreeNode( cur_node );
+		}
+	}
+
+	/* remove all comments and PI nodes */
+	cur = doc->children;
+	while(cur) {
+		/* fetch the successor first; cur may be freed below */
+		xmlNodePtr next = cur->next;
+		if( cur->type == XML_COMMENT_NODE || cur->type == XML_PI_NODE ) {
+			xmlUnlinkNode( cur );
+			xmlFreeNode( cur );
+		}
+		cur = next;
+	}
+
+	xmlXPathFreeObject(object);
+	xmlXPathFreeContext(xpathctx);
+
+}
+
+/*
+* Serializes doc and returns it as a malloc()ed string with everything up
+* to and including the first '>' (the XML declaration) removed, and one
+* trailing CR or LF dropped.  The caller frees the result with free().
+* Returns NULL when the document cannot be serialized or copied.
+*/
+char* _xml_to_string( xmlDocPtr doc ) {
+
+	int bufsize = 0;
+	xmlChar* xmlbuf = NULL;
+	char* xml;
+	char* decl_end;
+	size_t len;
+
+	xmlDocDumpFormatMemory( doc, &xmlbuf, &bufsize, 0 );
+	if( xmlbuf == NULL ) return NULL;
+
+	xml = strdup((const char*) xmlbuf);
+	xmlFree(xmlbuf);
+	if( xml == NULL ) return NULL;
+
+	/* when we reach the first >, keep only what follows it; memmove
+	 * copes with the overlap, so no temporary buffer is needed */
+	decl_end = strchr( xml, '>' );
+	if( decl_end != NULL )
+		memmove( xml, decl_end + 1, strlen(decl_end + 1) + 1 );
+
+	/* guard the empty string before looking at the last character */
+	len = strlen(xml);
+	if( len > 0 && (xml[len - 1] == '\n' || xml[len - 1] == '\r') )
+		xml[len - 1] = '\0';
+
+	return xml;
+
+}
-- 
2.43.2