"Fossies" - the Fresh Open Source Software Archive

Member "icu/source/python/icutools/databuilder/filtration.py" (22 Apr 2020, 14662 Bytes) of package /linux/misc/icu4c-67_1-src.tgz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "filtration.py" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes reports: 67rc_vs_67_1 or 66_1_vs_67_1.

# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
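    """Base class for the file filters used by the data builder.

    A filter decides, per input file, whether that file is included in the
    build; subclasses implement match().
    """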
    @staticmethod
    def create_from_json(json_data, io):
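        """Builds a Filter subclass instance from a JSON filter spec.

        The "filterType" key selects the subclass and defaults to
        "file-stem" when absent. An illustrative (not exhaustive) spec:

            {"filterType": "language", "whitelist": ["de", "ja"]}

        Returns None if the filterType is not recognized.
        """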
        assert io != None
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data, io)
        elif filter_type == "locale":
            return LocaleFilter(json_data, io)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
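        """Applies this file filter to a single request.

        Returns [request] if the request still applies after file filtering,
        or [] if the filter excluded the entire request.
        """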
        if not request.apply_file_filter(self):
            return []
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @staticmethod
    def _file_to_file_stem(file):
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @staticmethod
    def _file_to_subdir(file):
        limit = file.filename.rfind("/")
        if limit == -1:
            return None
        return file.filename[:limit]

    @abstractmethod
    def match(self, file):
        pass


class InclusionFilter(Filter):
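    """Filter that matches (includes) every file."""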
    def match(self, file):
        return True


class ExclusionFilter(Filter):
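    """Filter that matches no files (excludes everything)."""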
    def match(self, file):
        return False


class WhitelistBlacklistFilter(Filter):
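    """Base class for filters configured from JSON with either a "whitelist"
    or a "blacklist" list of strings; subclasses define what each entry is
    compared against via _should_include().
    """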
    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_whitelist = True
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.is_whitelist = False
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        pass


class FileStemFilter(WhitelistBlacklistFilter):
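    """Filters by exact match on the file stem (basename without extension)."""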
    def _should_include(self, file_stem):
        if self.is_whitelist:
            return file_stem in self.whitelist
        else:
            return file_stem not in self.blacklist


class LanguageFilter(WhitelistBlacklistFilter):
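    """Filters by the language subtag (the portion of the file stem before
    the first underscore); "root" is always included.
    """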
    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_whitelist:
            return language in self.whitelist
        else:
            return language not in self.blacklist


class RegexFilter(WhitelistBlacklistFilter):
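    """Filters by matching regular expressions against the file stem."""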
    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_whitelist:
            self.whitelist = [re.compile(pat) for pat in self.whitelist]
        else:
            self.blacklist = [re.compile(pat) for pat in self.blacklist]

    def _should_include(self, file_stem):
        if self.is_whitelist:
            for pattern in self.whitelist:
                if pattern.match(file_stem):
                    return True
            return False
        else:
            for pattern in self.blacklist:
                if pattern.match(file_stem):
                    return False
            return True


class UnionFilter(Filter):
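    """Matches a file iff at least one of its sub-filters matches it."""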
    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False


LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
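    """Filters locale data files based on a whitelist of requested locales.

    A file is kept if its locale is requested, is an ancestor of a requested
    locale, or (optionally) is a child or script variant of a requested
    locale. An illustrative spec:

        {
            "filterType": "locale",
            "includeChildren": true,
            "includeScripts": false,
            "whitelist": ["de", "sr"]
        }
    """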
    def __init__(self, json_data, io):
        self.locales_requested = list(json_data["whitelist"])
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)


def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config, io)
    requests = _apply_resource_filters(requests, config, io)
    return requests


def _apply_file_filters(old_requests, config, io):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests


def _preprocess_file_filters(requests, config, io):
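    """Builds a dict mapping request category to the Filter to apply.

    Categories whose effective filter is "include" are left out of the dict
    (no filtering needed). The default is "exclude" under the "additive"
    strategy and "include" otherwise.
    """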
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Figure out the correct filter to create
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            pass  # no-op
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
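    """Collects per-file resource filter rules for one category and rewrites
    the matching genrb requests to read the generated filter files via
    --filterDir.
    """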
    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        self.filter_tmp_dir = "filters/%s" % category
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        if self.strategy == "additive":
            self.rules_by_file = [
                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
                for _ in range(len(files))
            ]
        else:
            self.rules_by_file = [
                [r"+/"]
                for _ in range(len(files))
            ]

    def add_rules(self, file_filter, rules):
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result


def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests