build/i18n/tests/check_entities.py

   1 #!/usr/bin/env python
   2 # -----------------------------------------------------------------------
   3 # Copyright (C) 2008  Laurentian University
   4 # Dan Scott <dscott@laurentian.ca>
   5 #
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 # -----------------------------------------------------------------------
  16
  17 # vim:et:sw=4:ts=4: set fileencoding=utf-8 :
  18
  19 """
  20 Parse DTD files and XML files looking for trouble
  21     * Missing entities
  22 """
  23
  24 import os
  25 import re
  26
  27 DEBUG = False
  28
  29 DTD_DIRS = (
  30         '../../../Open-ILS/web/opac/locale/en-US/',
  31         )
  32
  33 XML_DIRS = (
  34         '../../../Open-ILS/xul/staff_client/server/',
  35         '../../../Open-ILS/xul/staff_client/chrome/',
  36         '../../../Open-ILS/web/reports/',
  37         )
  38
  39 def parse_entities():
  40     """
  41     Parse entities files in known places
  42     """
  43
  44     basedir = os.path.normpath(os.path.dirname(os.path.abspath(__file__)))
  45
  46     entities = {
  47         "amp" : "&",
  48         "lt" : "<",
  49         "gt" : ">",
  50         "nbsp" : ">",
  51         "quot" : ">",
  52     }
  53
  54     dtd_files = []
  55
  56     for p_dir in DTD_DIRS:
  57         p_dir = os.path.normpath(os.path.join(basedir, p_dir))
  58         file_list = os.listdir(p_dir)
  59         for d_file in file_list:
  60             if os.path.splitext(d_file)[1] == '.dtd':
  61                 dtd_files.append(os.path.join(p_dir, d_file))
  62
  63     prefix = os.path.commonprefix(dtd_files)
  64
  65     for d_file in dtd_files:
  66
  67         # Get the shortest unique address for this file
  68         short_df = d_file[len(prefix):]
  69
  70         dtd_file = open(d_file, 'r')
  71
  72         line_num = 1
  73
  74         for line in dtd_file:
  75             line_num += 1
  76
  77             # Get rid of trailing linefeed
  78             line = line[0:-1]
  79
  80             # Parse entity/value
  81             unpack = re.search(r'<!ENTITY\s+(.+?)\s+([\'"])(.*?)\2\s*>', line)
  82             if DEBUG and unpack:
  83                 print(unpack.groups())
  84
  85             # Skip anything other than entity definitions
  86             # Note that this makes some massive assumptions:
  87             #   1. that we only have with one entity defined per line
  88             #   2. that we only have single-line entities
  89             #   3. that the entity begins in position 0 on the line
  90             if not unpack or not line or not line.startswith('<!ENTITY'):
  91                 continue
  92
  93             # If we did not retrieve an entity and definition, that's probably not good
  94             if len(unpack.groups()) != 3:
  95                 print("%s:%d: No entity defined on line [%s]" % (short_df, line_num, line))
  96                 continue
  97
  98             entity_key, quote, value = unpack.groups()
  99             if DEBUG:
 100                 print(entity_key, value)
 101
 102             if not entities.has_key(entity_key):
 103                 entities[entity_key] = [{'value': value, 'file': short_df}]
 104                 continue
 105
 106             for entry in entities[entity_key]:
 107                 if entry['file'] == short_df:
 108                     print("%s:%d: Duplicate key '%s' in line [%s]" % (short_df, line_num, entity_key, line[0:-1]))
 109                     continue
 110
 111             entities[entity_key].append({'value': value, 'file': short_df})
 112
 113         dtd_file.close()
 114
 115     return entities
 116
 117 def check_files(entities):
 118     """
 119     Finds all the XUL, XHTML, and JavaScript files
 120     """
 121
 122     basedir = os.path.normpath(os.path.dirname(os.path.abspath(__file__)))
 123
 124     xul_files = []
 125
 126     for x_dir in XML_DIRS:
 127         for root, dirs, files in os.walk(x_dir):
 128             for x_file in files:
 129                 if os.path.splitext(x_file)[1] == '.xul' or \
 130                                    os.path.splitext(x_file)[1] == '.js' or \
 131                                    os.path.splitext(x_file)[1] == '.html' or \
 132                                    os.path.splitext(x_file)[1] == '.xhtml':
 133                     check_xul(root, x_file, entities)
 134
 135 def check_xul(root, filename, entities):
 136     """
 137     Checks all XUL files to ensure:
 138       * that the requested entity exists
 139       * that every entity is actually required
 140     """
 141
 142     num_strings = 0
 143
 144     # Typical entity usage:
 145     # &blah.blah.blah_bity.blah;
 146     strings = re.compile(r'''&([a-zA-Z:_][a-zA-Z0-9:_\-.]+);''')
 147
 148     xul = open(os.path.join(root, filename), 'r')
 149     content = xul.read()
 150     xul.close()
 151
 152     if DEBUG:
 153         print("File: %s" % (os.path.normpath(os.path.join(root, filename))))
 154
 155     for s_match in strings.finditer(content):
 156         num_strings += 1
 157         if not entities.has_key(s_match.group(1)):
 158             print("File: %s" % (os.path.normpath(os.path.join(root, filename))))
 159             print("\tEntity %s not found, expected in %s" % (s_match.group(1), 'lang.dtd'))
 160
 161         # Find bad entities
 162         bad_strings = re.compile(r'''&([^a-zA-Z:_]?[a-zA-Z0-9:_]*[^a-zA-Z0-9:_\-.;][a-zA-Z0-9:_\-.]*);''')
 163
 164         # Match character entities (&#0129; etc), which are okay
 165         char_entity = re.compile(r'''^((#([0-9])+)|(#x([0-9a-fA-F])+))$''')
 166
 167         for s_match in bad_strings.finditer(content):
 168                 # Rule out character entities and URL concatenation
 169                 if (not char_entity.search(s_match.group(1))) and s_match.group(1) != "'":
 170                         print("File: %s" % (os.path.normpath(os.path.join(root, filename))))
 171                         print("\tBad entity: %s" % (s_match.group(1)))
 172
 173     if DEBUG:
 174         print("\t%d entities found" % (num_strings))
 175
 176 if __name__ == '__main__':
 177     entities = parse_entities()
 178     check_files(entities)