From 240e725c959aa5cfbfc44d02ccb71aaa1e2a754b Mon Sep 17 00:00:00 2001 From: dbs Date: Wed, 20 Feb 2008 04:38:35 +0000 Subject: [PATCH] French MARC tooltip creator for the cataloging interface git-svn-id: svn://svn.open-ils.org/ILS/trunk@8794 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- build/i18n/scripts/marc_tooltip_maker.py | 306 +++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 build/i18n/scripts/marc_tooltip_maker.py diff --git a/build/i18n/scripts/marc_tooltip_maker.py b/build/i18n/scripts/marc_tooltip_maker.py new file mode 100644 index 0000000000..c82d7de401 --- /dev/null +++ b/build/i18n/scripts/marc_tooltip_maker.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : +# vim:et:ts=4:sw=4: + +# Copyright (C) 2008 Laurentian University +# Dan Scott +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + +""" +The MARC editor offers tooltips generated from the Library of Congress Concise +MARC Record documentation available online. + +This script generates a French version of those tooltips based on the Library +and Archives Canada translation of the LoC documentation. +""" + +from BeautifulSoup import BeautifulSoup + +# Simple case: +# Get : map $1 to tag attribute +# From within that A event, retrieve the SMALL event +# If SMALL.cdata == '\s*(R)\s*' then repeatable = yes +# If SMALL.cdata == '\s*(NR)\s*' then repeatable = no +# Get the next P event: map to element +# +# Target: +# +# This field contains 18 character positions (00-17) +# that provide for coding information about special aspects of +# the item being cataloged that cannot be coded in field 008 +# (Fixed-Length Data Elements). It is used in cases when an item +# has multiple characteristics. It is also used to record the coded +# serial aspects of nontextual continuing resources. +# + +# Complex case: + +# field and tag and repeatable description as above +# check for

Indicateurs

before next

+# check for
  • Premier indicateur or
  • Second indicateur to set indicator.position +# check for
  • (\d)\s*-\s*([^<]*)< for indicator.position.value = def__init__ion +# ignore if "Non défini" +# check for

    Codes do sous-zones +# for each
  • : +# CDATA (stripped of tags, with (NR) or (R) stripped out) = field.subfield.def__init__ion +# (NR) or (R) means field.subfield.repeatable = false or true + +# +# An author/title series added entry in which the +# author portion is a personal name. +# +# Forename +# +# +# Surname +# +# +# Family name +# +# +# Personal name +# +# +# Numeration +# + +class MarcCollection(object): + """ + Contains a set of descriptions of MARC fields organized by tag + """ + + def __init__(self): + self.fields = {} + + def add_field(self, field): + """ + Add a MARC field to our collection + """ + self.fields[field.tag] = field + + def to_xml(self): + """ + Convert the MARC field collection to XML representation + """ + xml = "\n" + xml += "\n" + keys = self.fields.keys() + keys.sort() + for key in keys: + xml += self.fields[key].to_xml() + xml += "\n\n" + return xml + +class MarcField(object): + """ + Describes the properties of a MARC field + + You can directly access and manipulate the indicators and subfields lists + """ + def __init__(self, tag, repeatable, description): + self.tag = tag + self.repeatable = repeatable + self.description = description + self.indicators = [] + self.subfields = [] + + def to_xml(self): + """ + Convert the MARC field to XML representation + """ + xml = u" \n" % (self.repeatable, self.tag) + xml += u" %s\n" % (self.description) + for ind in self.indicators: + xml += ind.to_xml() + xml += '\n' + for subfield in self.subfields: + xml += subfield.to_xml() + xml += '\n' + xml += u" \n" + + return xml + +class Subfield(object): + """ + Describes the properties of a MARC subfield + """ + def __init__(self, code, repeatable, description): + self.code = code + self.repeatable = repeatable + self.description = description + + def to_xml(self): + """ + Convert the subfield to XML representation + """ + xml = u" \n" % (self.code, self.repeatable) + xml += u" %s\n" % (self.description) + xml += u" \n" + return xml + +class Indicator(object): + """ + Describes the properties of an indicator-value pair for a MARC field + """ + def __init__(self, position, value, description): + self.position = position + self.value = value + self.description = description + + def to_xml(self): + """ + Convert the indicator-value pair to XML representation + """ + xml = u" \n" % (self.position, self.value) + xml += u" %s\n" % (self.description) + xml += u" \n" + return xml + +def process_indicator(field, position, raw_ind): + """ + Given an XML chunk holding indicator data, + append Indicator objects to a MARC field + """ + if (re.compile(r'indicateur\s*-\s*Non').search(raw_ind.contents[0])): + return None + if (not raw_ind.ul): + print "No %d indicator for %s, although not not defined either..." % (position, field.tag) + return None + ind_values = raw_ind.ul.findAll('li') + for value in ind_values: + text = ''.join(value.findAll(text=True)) + if (re.compile(u'non précisé').search(text)): + continue + matches = re.compile(r'^(\S(-\S)?)\s*-\s*(.+)$', re.S).search(text) + if matches is None: + continue + new_ind = Indicator(position, matches.group(1), matches.group(3)) + field.indicators.append(new_ind) + +def process_subfield(field, subfield): + """ + Given an XML chunk holding subfield data, + append a Subfield object to a MARC field + """ + repeatable = 'true' + + if (subfield.span): + if (re.compile(r'\(R\)').search(subfield.span.renderContents())): + repeatable = 'false' + subfield.span.extract() + elif (subfield.small): + if (re.compile(r'\(R\)').search(subfield.small.renderContents())): + repeatable = 'false' + subfield.small.extract() + else: + print "%s has no small or span tags?" % (field.tag) + + subfield_text = re.compile(r'\n').sub(' ', ''.join(subfield.findAll(text=True))) + matches = re.compile(r'^\$(\w)\s*-\s*(.+)$', re.S).search(subfield_text) + if (not matches): + print "No subfield match for field: " + field.tag + return None + field.subfields.append(Subfield(matches.group(1), repeatable, matches.group(2))) + +def process_tag(tag): + """ + Given a chunk of XML representing a MARC field, generate a MarcField object + """ + repeatable = 'true' + description = u'' + + # Get tag + tag_num = re.compile(r'^mrcb(\d+)').sub(r'\1', tag['id']) + if (len(tag_num) != 3): + return None + + # Get repeatable - most stored in , some stored in + if (re.compile(r'\(NR\)').search(tag.renderContents())): + repeatable = 'false' + + # Get description + desc = tag.parent.findNextSibling('p') + if (not desc): + print "No description for %s" % (tag_num) + else: + if (str(desc.__class__) == 'BeautifulSoup.Tag'): + try: + description += u''.join(desc.findAll(text=True)) + except: + print "Bad description for: " + tag_num + print u' '.join(desc.findAll(text=True)) + else: + description += desc.string + + # Create the tag + field = MarcField(tag_num, repeatable, description) + + for desc in tag.parent.findNextSiblings(): + if (str(desc.__class__) == 'BeautifulSoup.Tag'): + if (desc.name == 'h2'): + break + elif (desc.name == 'h3' and re.compile(r'Indicateurs').search(desc.string)): + # process indicators + first_ind = desc.findNextSibling('ul').li + second_ind = first_ind.findNextSibling('li') + if (not second_ind): + second_ind = first_ind.parent.findNextSibling('ul').li + process_indicator(field, 1, first_ind) + process_indicator(field, 2, second_ind) + elif (desc.name == 'h3' and re.compile(r'Codes de sous').search(desc.string)): + # Get subfields + subfield = desc.findNextSibling('ul').li + while (subfield): + process_subfield(field, subfield) + subfield = subfield.findNextSibling('li') + + return field + +if __name__ == '__main__': + import copy + import os + import re + import subprocess + + ALL_MY_FIELDS = MarcCollection() + + # Run through the LAC-BAC MARC files we care about and convert like crazy + for filename in os.listdir('.'): + + if (not re.compile(r'^040010-1\d\d\d-f.html').search(filename)): + continue + print filename + devnull = open('/dev/null', 'w') + file = subprocess.Popen( + ('tidy', '-asxml', '-n', '-q', '-utf8', filename), + stdout=subprocess.PIPE, stderr=devnull).communicate()[0] + + # Strip out the hard spaces on our way through + hardMassage = [(re.compile(r' '), lambda match: ' ')] + myHardMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) + myHardMassage.extend(myHardMassage) + + filexml = BeautifulSoup(file, markupMassage=myHardMassage) + + tags = filexml.findAll('a', id=re.compile(r'^mrcb')) + for tag in tags: + field = process_tag(tag) + if (field): + ALL_MY_FIELDS.add_field(field) + + MARCOUT = open('marcedit-tooltips-fr.xml', 'w') + MARCOUT.write(ALL_MY_FIELDS.to_xml().encode('UTF-8')) + MARCOUT.close() -- 2.43.2