2 # vim: set fileencoding=utf-8 :
5 # Copyright (C) 2008 Laurentian University
6 # Dan Scott <dscott@laurentian.ca>
8 # This program is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU General Public License
10 # as published by the Free Software Foundation; either version 2
11 # of the License, or (at your option) any later version.
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 The MARC editor offers tooltips generated from the Library of Congress Concise
24 MARC Record documentation available online.
26 This script generates a French version of those tooltips based on the Library
27 and Archives Canada translation of the LoC documentation.
import codecs
import copy
import os
import re
import subprocess

from BeautifulSoup import BeautifulSoup
33 # Get <a id="#mrcb(###)">: map $1 to tag attribute
34 # From within that A event, retrieve the SMALL event
35 # If SMALL.cdata == '\s*(R)\s*' then repeatable = yes
36 # If SMALL.cdata == '\s*(NR)\s*' then repeatable = no
37 # Get the next P event: map to <description> element
40 # <field repeatable="true" tag="006">
41 # <description>This field contains 18 character positions (00-17)
42 # that provide for coding information about special aspects of
43 # the item being cataloged that cannot be coded in field 008
44 # (Fixed-Length Data Elements). It is used in cases when an item
45 # has multiple characteristics. It is also used to record the coded
46 # serial aspects of nontextual continuing resources.</description>
51 # field and tag and repeatable description as above
52 # check for <h3>Indicateurs</h3> before next <h2>
53 # check for <li>Premier indicateur or <li>Second indicateur to set indicator.position
54 # check for <li class="sqf">(\d)\s*-\s*([^<]*)< for indicator.position.value = definition
55 # ignore if "Non défini"
56 # check for <h3>Codes de sous-zones
58 # CDATA (stripped of tags, with (NR) or (R) stripped out) = field.subfield.definition
59 # (NR) or (R) means field.subfield.repeatable = false or true
61 # <field repeatable="true" tag="800">
62 # <description>An author/title series added entry in which the
63 # author portion is a personal name.</description>
64 # <indicator position="1" value="0">
65 # <description>Forename</description>
67 # <indicator position="1" value="1">
68 # <description>Surname</description>
70 # <indicator position="1" value="3">
71 # <description>Family name</description>
73 # <subfield code="a" repeatable="false">
74 # <description>Personal name </description>
76 # <subfield code="b" repeatable="false">
77 # <description>Numeration </description>
class MarcCollection(object):
    """
    Contains a set of descriptions of MARC fields organized by tag

    Fields are stored in a dictionary keyed on ``field.tag``, so adding a
    second field with the same tag replaces the first one.
    """

    def __init__(self):
        # tag -> field object; any object exposing ``tag`` and ``to_xml()``
        self.fields = {}

    def add_field(self, field):
        """
        Add a MARC field to our collection

        field -- object with a ``tag`` attribute and a ``to_xml()`` method
        """
        self.fields[field.tag] = field

    def to_xml(self):
        """
        Convert the MARC field collection to XML representation

        Returns the XML declaration, a <fields> root element and the XML of
        every stored field in ascending tag order, as one string.
        """
        # NOTE(review): the extracted source only shows the closing
        # "</fields>" tag; the opening tag is reconstructed as a bare
        # "<fields>" - confirm against the upstream file.
        parts = [
            u"<?xml version='1.0' encoding='utf-8'?>\n",
            u"<fields>\n",
        ]
        # sorted() gives a stable tag order on both Python 2 and Python 3,
        # where dict.keys() is no longer a sortable list.
        parts.extend(self.fields[key].to_xml() for key in sorted(self.fields))
        parts.append(u"\n</fields>\n")
        return u"".join(parts)
class MarcField(object):
    """
    Describes the properties of a MARC field

    You can directly access and manipulate the indicators and subfields lists
    """

    def __init__(self, tag, name, repeatable, description):
        """
        tag -- field tag; used as the ``tag`` attribute in the XML
        name -- field name, emitted in <name>
        repeatable -- value emitted verbatim in the ``repeatable`` attribute
        description -- text emitted in <description>
        """
        self.tag = tag
        self.name = name
        self.repeatable = repeatable
        self.description = description
        # Filled in by callers after construction; each item needs to_xml()
        self.indicators = []
        self.subfields = []

    def to_xml(self):
        """
        Convert the MARC field to XML representation

        Returns the <field> element with its name and description, followed
        by the XML of every indicator and then every subfield. Values are
        interpolated as-is; no XML escaping is applied here.
        """
        parts = [
            u" <field repeatable='%s' tag='%s'>\n" % (self.repeatable, self.tag),
            u" <name>%s</name>\n" % (self.name,),
            u" <description>%s</description>\n" % (self.description,),
        ]
        # NOTE(review): the loop bodies were missing from the extracted
        # source; they are reconstructed as a plain append of each child's
        # XML - confirm whether upstream also added a separator line.
        parts.extend(ind.to_xml() for ind in self.indicators)
        parts.extend(subfield.to_xml() for subfield in self.subfields)
        parts.append(u" </field>\n")
        return u"".join(parts)
class Subfield(object):
    """
    Describes the properties of a MARC subfield
    """

    def __init__(self, code, repeatable, description):
        """
        code -- subfield code, emitted in the ``code`` attribute
        repeatable -- value emitted verbatim in the ``repeatable`` attribute
        description -- text emitted in <description>
        """
        self.code = code
        self.repeatable = repeatable
        self.description = description

    def to_xml(self):
        """
        Convert the subfield to XML representation

        Returns a <subfield> element wrapping one <description>. Values are
        interpolated as-is; no XML escaping is applied here.
        """
        return u"".join([
            u" <subfield code='%s' repeatable='%s'>\n" % (self.code, self.repeatable),
            u" <description>%s</description>\n" % (self.description,),
            u" </subfield>\n",
        ])
class Indicator(object):
    """
    Describes the properties of an indicator-value pair for a MARC field
    """

    def __init__(self, position, value, description):
        """
        position -- indicator position, emitted in the ``position`` attribute
        value -- indicator value, emitted in the ``value`` attribute
        description -- text emitted in <description>
        """
        self.position = position
        self.value = value
        self.description = description

    def to_xml(self):
        """
        Convert the indicator-value pair to XML representation

        Returns an <indicator> element wrapping one <description>. Values
        are interpolated as-is; no XML escaping is applied here.
        """
        return u"".join([
            u" <indicator position='%s' value='%s'>\n" % (self.position, self.value),
            u" <description>%s</description>\n" % (self.description,),
            u" </indicator>\n",
        ])
def process_indicator(field, position, raw_ind):
    """
    Given an XML chunk holding indicator data,
    append Indicator objects to a MARC field

    field -- field object whose ``indicators`` list is extended in place
    position -- indicator position, stored on every Indicator created
    raw_ind -- parsed <li> element for one indicator; its first child is
               read as the heading text and its nested <ul> as the values

    Returns None; the result is the side effect on ``field.indicators``.

    NOTE(review): the source this was rebuilt from had lost its short lines
    (early returns, ``continue`` statements, the guard before the log
    message). They are reconstructed from the surrounding statements -
    confirm against the upstream file.
    """
    # An indicator headed "... indicateur - Non défini" has no values to record
    if re.search(r'indicateur\s*-\s*Non', raw_ind.contents[0]):
        return None

    # Without a nested <ul> there is nothing to iterate over below
    if raw_ind.ul is None:
        print("No %d indicator for %s, although not not defined either..." % (position, field.tag))
        return None

    # Compile once instead of once per list item
    unspecified = re.compile(u'non précisé')
    value_pattern = re.compile(r'^(\S(-\S)?)\s*-\s*(.+)$', re.S)

    for value in raw_ind.ul.findAll('li'):
        text = ''.join(value.findAll(text=True))
        # NOTE(review): assumed that "non précisé" entries are skipped
        if unspecified.search(text):
            continue
        matches = value_pattern.search(text)
        if not matches:
            continue
        # group(1) is the value (single character or a "x-y" range),
        # group(3) its description; both are flattened onto one line
        ind_value = matches.group(1).replace('\n', ' ').rstrip()
        ind_desc = matches.group(3).replace('\n', ' ').rstrip()
        field.indicators.append(Indicator(position, ind_value, ind_desc))
def process_subfield(field, subfield):
    """
    Given an XML chunk holding subfield data,
    append a Subfield object to a MARC field

    field -- field object whose ``subfields`` list is extended in place
    subfield -- parsed <li> element for one subfield; the <span> or <small>
                element holding the repeatability marker is removed from it

    Returns None; the result is the side effect on ``field.subfields``.

    NOTE(review): the source this was rebuilt from had lost its short lines
    (the ``repeatable`` assignments, the ``if``/``else`` heads and the
    no-match return). "(R)" is mapped to 'true' and everything else to
    'false', following the file's header comment - confirm against upstream.
    """
    repeatable = 'false'

    # The marker is looked up in <span> first, then in <small>
    marker = subfield.span
    if marker is None:
        marker = subfield.small

    if marker is None:
        print("%s has no small or span tags?" % (field.tag))
    else:
        if re.search(r'\(R\)', marker.renderContents()):
            repeatable = 'true'
        # Drop the marker so it does not end up in the description text
        marker.extract()

    # Flatten the remaining text nodes onto a single line
    subfield_text = ''.join(subfield.findAll(text=True)).replace('\n', ' ')
    matches = re.search(r'^\$(\w)\s*-\s*(.+)$', subfield_text, re.S)
    if not matches:
        print("No subfield match for field: " + field.tag)
        return None

    field.subfields.append(
        Subfield(matches.group(1).rstrip(), repeatable, matches.group(2).rstrip())
    )
# process_tag: build a MarcField from one <a id="mrcb###"> anchor and the
# elements that follow its parent heading.
# NOTE(review): this listing is an incomplete extraction. The leading numbers
# are the original file's line numbers, indentation has been lost, and the
# numbering shows gaps (223-228, 231-232, 235-236, 241-242, 244, 246, 248,
# 250, 253, 256-257, 259, 263, 265, 268, 273, 275, 278-280). The statements
# on those lines - including where name, description and repeatable are
# first assigned and what the function returns - are not visible here.
# Restore the file from version control before changing any logic.
220 def process_tag(tag):
222 Given a chunk of XML representing a MARC field, generate a MarcField object
# The field tag is the anchor id with its leading "mrcb<digits>" replaced by
# the digits alone.
229 tag_num = re.compile(r'^mrcb(\d+)').sub(r'\1', tag['id'])
# Tags that are not exactly three characters long are special-cased; the
# statement guarded by this test is not visible (presumably an early return -
# TODO confirm).
230 if (len(tag_num) != 3):
233 # Get repeatable - most stored in <span>, some stored in <small>
# Looks for the literal "(NR)" anywhere in the anchor's rendered contents; the
# guarded assignment is not visible (presumably marks the field as not
# repeatable - TODO confirm).
234 if (re.compile(r'\(NR\)').search(tag.renderContents())):
237 # Get name - stored in <h2> like:
238 # <h2><a id="mrcb250">250 - Mention d'édition <span class="small">(NR)</span></a>
# From the anchor's concatenated text nodes, keep what lies between the first
# "-" and the last "(", then flatten newlines and strip trailing whitespace.
239 name = re.compile(r'^.+?-\s*(.+)\s*\(.+$', re.S).sub(r'\1', ''.join(tag.findAll(text=True)))
240 name = name.replace('\n', ' ').rstrip()
# The description is taken from the first <p> sibling following the anchor's
# parent element.
243 desc = tag.parent.findNextSibling('p')
245 print "No description for %s" % (tag_num)
# The type test compares the class's string form with 'BeautifulSoup.Tag'
# instead of using isinstance().
# NOTE(review): the branch heads around the next statements (original lines
# 246, 248, 250, 253) are missing, so it cannot be told from here which of
# them run for a Tag and which for a non-Tag node.
247 if (str(desc.__class__) == 'BeautifulSoup.Tag'):
249 description += u''.join(desc.findAll(text=True))
251 print "Bad description for: " + tag_num
252 print u' '.join(desc.findAll(text=True))
254 description += desc.string
255 description = description.replace('\n', ' ').rstrip()
258 field = MarcField(tag_num, name, repeatable, description)
# Walk the siblings that follow the anchor's parent, dispatching on the
# element name and heading text. The statement under the 'h2' test is not
# visible (presumably stops at the next field heading - TODO confirm).
260 for desc in tag.parent.findNextSiblings():
261 if (str(desc.__class__) == 'BeautifulSoup.Tag'):
262 if (desc.name == 'h2'):
264 elif (desc.name == 'h3' and re.compile(r'Indicateurs').search(desc.string)):
# The first indicator is the first <li> of the next <ul>; the second is its
# next <li> sibling. The alternative lookup in the following <ul> has lost
# its condition (presumably used when no sibling <li> exists - TODO confirm).
266 first_ind = desc.findNextSibling('ul').li
267 second_ind = first_ind.findNextSibling('li')
269 second_ind = first_ind.parent.findNextSibling('ul').li
270 process_indicator(field, 1, first_ind)
271 process_indicator(field, 2, second_ind)
272 elif (desc.name == 'h3' and re.compile(r'Codes de sous').search(desc.string)):
# Subfields are the <li> items of the next <ul>: each is handed to
# process_subfield and then the next <li> sibling is fetched. The loop header
# itself is not visible.
274 subfield = desc.findNextSibling('ul').li
276 process_subfield(field, subfield)
277 subfield = subfield.findNextSibling('li')
if __name__ == '__main__':
    # Script entry point: run every LAC-BAC French MARC page found in the
    # current directory through tidy and BeautifulSoup, collect the fields
    # and write them to marcedit-tooltips-fr.xml.
    ALL_MY_FIELDS = MarcCollection()

    # Strip out the hard spaces on our way through.
    # NOTE(review): in the extracted source the pattern was a single blank,
    # which makes the substitution a no-op. tidy is invoked with -n (numeric
    # entities), so a hard space is expected to arrive as "&#160;" - confirm
    # against the upstream file.
    hardMassage = [(re.compile(r'&#160;'), lambda match: ' ')]
    myHardMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    # Previously the list was extended with itself, which duplicated the
    # default rules and never applied hardMassage.
    myHardMassage.extend(hardMassage)

    page_name = re.compile(r'^040010-1\d\d\d-f.html')
    anchor_id = re.compile(r'^mrcb')

    # One shared sink for tidy's diagnostics, closed when the run is over
    devnull = open(os.devnull, 'w')
    try:
        # Run through the LAC-BAC MARC files we care about and convert like crazy
        for filename in os.listdir('.'):
            if not page_name.search(filename):
                continue

            tidied = subprocess.Popen(
                ('tidy', '-asxml', '-n', '-q', '-utf8', filename),
                stdout=subprocess.PIPE, stderr=devnull).communicate()[0]

            filexml = BeautifulSoup(tidied, markupMassage=myHardMassage)

            for tag in filexml.findAll('a', id=anchor_id):
                field = process_tag(tag)
                # Skip anchors for which no field object was produced
                if field is not None:
                    ALL_MY_FIELDS.add_field(field)
    finally:
        devnull.close()

    # codecs.open() already encodes to UTF-8 on write; handing it pre-encoded
    # bytes makes it decode them as ASCII first, which fails on accented text.
    MARCOUT = codecs.open('marcedit-tooltips-fr.xml', encoding='utf-8', mode='w')
    try:
        MARCOUT.write(ALL_MY_FIELDS.to_xml())
    finally:
        MARCOUT.close()