diff options
Diffstat (limited to 'ganarchy/data.py')
-rw-r--r-- | ganarchy/data.py | 585 |
1 files changed, 585 insertions, 0 deletions
diff --git a/ganarchy/data.py b/ganarchy/data.py new file mode 100644 index 0000000..bc5a1b6 --- /dev/null +++ b/ganarchy/data.py @@ -0,0 +1,585 @@ +# This file is part of GAnarchy - decentralized project hub +# Copyright (C) 2019 Soni L. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""This module handles GAnarchy's data and config sources. + +A data source can be either a config source or a repo list source, but be +careful: they use identical syntax, but have different semantics! Mistaking +a repo list source for a config source is a recipe for security bugs! +""" + +import abc +import itertools +import os +import re +import time + +import abdl +import abdl.exceptions +import qtoml +import requests + +from enum import Enum +from urllib.parse import urlparse + +# TODO move elsewhere +class URIPredicate(abdl.predicates.Predicate): + def __init__(self, ports=range(1,65536), schemes=('https',)): + self.ports = ports + self.schemes = schemes + + def accept(self, obj): + try: + u = urlparse(obj) + if not u: + return False + # also raises for invalid ports, see https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse + # "Reading the port attribute will raise a ValueError if an invalid port is specified in the URL. [...]" + if u.port is not None and u.port not in self.ports: + return False + if u.scheme not in self.schemes: + return False + except ValueError: + return False + return True + +class CommitPredicate(abdl.predicates.Predicate): + def __init__(self, sha256ready=True): + if sha256ready: + self.re = re.compile(r"^[0-9a-fA-F]{40}$|^[0-9a-fA-F]{64}$") + else: + self.re = re.compile(r"^[0-9a-fA-F]{40}$") + + def accept(self, obj): + return self.re.match(obj) + +# sanitize = skip invalid entries +# validate = error on invalid entries +# LEGACY. DO NOT USE. +CONFIG_REPOS_SANITIZE = abdl.compile("""->'projects'?:?$dict + ->commit[:?$commit]:?$dict + ->url[:?$str:?$uri]:?$dict + ->branch:?$dict(->'active'?:?$bool)""", + dict(bool=bool, dict=dict, str=str, uri=URIPredicate(), commit=CommitPredicate())) + +CONFIG_TITLE_SANITIZE = abdl.compile("""->title'title'?:?$str""", dict(str=str)) +CONFIG_BASE_URL_SANITIZE = abdl.compile("""->base_url'base_url'?:?$str:?$uri""", dict(str=str, uri=URIPredicate())) + +# modern matchers, raise ValidationError if the data doesn't exist. +# they still skip "bad" entries, just like the old matchers. + +_MATCHER_REPOS = abdl.compile("""->'projects':$dict + ->commit[:?$commit]:?$dict + ->url[:?$str:?$uri]:?$dict + ->branch:?$dict(->'active'?:?$bool)""", + dict(bool=bool, dict=dict, str=str, uri=URIPredicate(), commit=CommitPredicate())) +_MATCHER_REPO_LIST_SRCS = abdl.compile("""->'repo_list_srcs':$dict + ->src[:?$str:?$uri]:?$dict + (->'active'?:?$bool)""", + dict(bool=bool, list=list, dict=dict, str=str, uri=URIPredicate(schemes=('https','file',)))) +# TODO +#_MATCHER_ALIASES = abdl.compile("""->'project_settings':$dict +# ->commit/[0-9a-fA-F]{40}|[0-9a-fA-F]{64}/?:?$dict +# """, {'dict': dict}) # FIXME check for aliases, might require changes to abdl + +_MATCHER_TITLE = abdl.compile("""->title'title':$str""", dict(str=str)) +_MATCHER_BASE_URL = abdl.compile("""->base_url'base_url':$str:$uri""", dict(str=str, uri=URIPredicate())) + +class OverridableProperty(abc.ABC): + """An overridable property, with options. + + Attributes: + options (dict): Options. + """ + + @abc.abstractmethod + def as_key(self): + """Returns an opaque representation of this OverridablePRoperty + suitable for use as a dict key. + + The returned object is not suitable for other purposes. + """ + return () + + @property + def active(self): + """Whether this property is active. + """ + return self.options.get('active', False) + +class PCTP(OverridableProperty): + """A Project Commit-Tree Path. + + Attributes: + project_commit (str): The project commit. + uri (str): The URI of a fork of the project. + branch (str): The branch name, or "HEAD" for the default branch. + options (dict): A dict of fork-specific options. + """ + + def __init__(self, project_commit, uri, branch, options): + self.project_commit = project_commit + self.uri = uri + self.branch = branch + self.options = options + + def as_key(self): + return (self.project_commit, self.uri, self.branch, ) + +class RepoListSource(OverridableProperty): + """A source for a repo list. + + Attributes: + uri (str): The URI of the repo list. + options (dict): A dict of repo list-specific options. + """ + + def __init__(self, uri, options): + self.uri = uri + self.options = options + + def as_key(self): + return (self.uri, ) + +class DataProperty(Enum): + """Represents values that can be returned by a data source. + + See documentation for DataSource get_property_value and + DataSource get_property_values for more details. + """ + INSTANCE_TITLE = (1, str) + INSTANCE_BASE_URL = (2, str) + VCS_REPOS = (3, PCTP) + REPO_LIST_SOURCES = (4, RepoListSource) + + def get_type(self): + """Returns the expected type for values from this DataProperty. + """ + return self.value[1] + +class PropertyError(LookupError): + """Raised to indicate improper use of a DataProperty. + """ + pass + +class DataSource(abc.ABC): + @abc.abstractmethod + def update(self): + """Refreshes the data associated with this source, if necessary. + """ + pass + + @abc.abstractmethod + def exists(self): + """Returns whether this source has usable data. + """ + pass + + @abc.abstractmethod + def get_supported_properties(self): + """Returns an iterable of properties supported by this data source. + + Returns: + Iterable of DataProperty: Supported properties. + + """ + return () + + def get_property_value(self, prop): + """Returns the value associated with the given property. + + If duplicated, an earlier value should override a later value. + + Args: + prop (DataProperty): The property. + + Returns: + The value associated with the given property. + + Raises: + PropertyError: If the property is not supported by this data + source. + LookupError: If the property is supported, but isn't available. + ValueError: If the property doesn't have exactly one value. + """ + iterator = self.get_property_values(prop) + try: + # note: unpacking + ret, = iterator + except LookupError as exc: raise RuntimeError from exc # don't accidentally swallow bugs in the iterator + return ret + + @abc.abstractmethod + def get_property_values(self, prop): + """Yields the values associated with the given property. + + If duplicated, earlier values should override later values. + + Args: + prop (DataProperty): The property. + + Yields: + The values associated with the given property. + + Raises: + PropertyError: If the property is not supported by this data + source. + LookupError: If the property is supported, but isn't available. + + """ + raise PropertyError + +class ObjectDataSource(DataSource): + """A DataSource backed by a Python object. + + Updates to the backing object will be immediately reflected in this + DataSource. + """ + _SUPPORTED_PROPERTIES = { + DataProperty.INSTANCE_TITLE: lambda obj: (d['title'][1] for d in _MATCHER_TITLE.match(obj)), + DataProperty.INSTANCE_BASE_URL: lambda obj: (d['base_url'][1] for d in _MATCHER_BASE_URL.match(obj)), + DataProperty.VCS_REPOS: lambda obj: (PCTP(r['commit'][0], r['url'][0], r['branch'][0], r['branch'][1]) for r in _MATCHER_REPOS.match(obj)), + DataProperty.REPO_LIST_SOURCES: lambda obj: (RepoListSource(d['src'][0], d['src'][1]) for d in _MATCHER_REPO_LIST_SRCS.match(obj)), + } + + def __init__(self, obj): + self._obj = obj + + def update(self): + pass + + def exists(self): + return True + + def get_property_values(self, prop): + try: + factory = self.get_supported_properties()[prop] + except KeyError as exc: raise PropertyError from exc + iterator = factory(self._obj) + try: + first = next(iterator) + except StopIteration: return (x for x in ()) + except abdl.exceptions.ValidationError as exc: raise LookupError from exc + except LookupError as exc: raise RuntimeError from exc # don't accidentally swallow bugs in the iterator + return itertools.chain([first], iterator) + + @classmethod + def get_supported_properties(cls): + return cls._SUPPORTED_PROPERTIES + +class LocalDataSource(ObjectDataSource): + def __init__(self, filename): + super().__init__({}) + self.file_exists = False + self.last_updated = None + self.filename = filename + + def update(self): + try: + updtime = self.last_updated + self.last_updated = os.stat(self.filename).st_mtime + if not self.file_exists or updtime != self.last_updated: + with open(self.filename) as f: + self._obj = qtoml.load(f) + self.file_exists = True + except (OSError, UnicodeDecodeError, qtoml.decoder.TOMLDecodeError) as e: + self.file_exists = False + self.last_updated = None + self._obj = {} + return e + + def exists(self): + return self.file_exists + + def __repr__(self): + return "LocalDataSource({!r})".format(self.filename) + +class RemoteDataSource(ObjectDataSource): + def __init__(self, uri): + super().__init__({}) + self.uri = uri + self.remote_exists = False + self.next_update = 0 + + def update(self): + if self.next_update > time.time(): + return + # I long for the day when toml has a registered media type + response = requests.get(self.uri, headers={'user-agent': 'ganarchy/0.0.0', 'accept': '*/*'}) + self.remote_exists = response.status_code == 200 + seconds = 3600 + if (refresh := response.headers.get('Refresh', None)) is not None: + try: + seconds = int(refresh) + except ValueError: + refresh = refresh.split(';', 1) + try: + seconds = int(refresh[0]) + except ValueError: + pass + self.next_update = time.time() + seconds + if self.remote_exists: + response.encoding = 'utf-8' + try: + self._obj = qtoml.loads(response.text) + except (UnicodeDecodeError, qtoml.decoder.TOMLDecodeError) as e: + self._obj = {} + return e + else: + return response + + def exists(self): + return self.remote_exists + + def __repr__(self): + return "RemoteDataSource({!r})".format(self.uri) + +class DefaultsDataSource(ObjectDataSource): + """Provides a way for contributors to define/encourage some default + settings. + + In particular, enables contributors to have a say in default domain + blocks. + """ + DEFAULTS = {} + + def __init__(self): + super().__init__(self.DEFAULTS) + + def exists(self): + return True + + def update(self): + return + + def __repr__(self): + return "DefaultsDataSource()" + + +class ConfigManager(DataSource): + """A ConfigManager takes care of managing config sources and + collecting their details. + + Args: + sources (list of DataSource): The config sources to be managed. + """ + def __init__(self, sources): + self.sources = sources + + @classmethod + def new_default(cls): + from ganarchy import config_home, config_dirs + srcs = [LocalDataSource(d + "/config.toml") for d in [config_home] + config_dirs] + return cls(srcs) + + def exists(self): + return True + + def update(self): + excs = [] + for source in self.sources: + excs.append(source.update()) + return excs + + def get_supported_properties(self): + return DataProperty + + def get_property_values(self, prop): + if prop not in self.get_supported_properties(): + raise PropertyError + elif prop == DataProperty.VCS_REPOS: + return self._get_vcs_repos() + elif prop == DataProperty.REPO_LIST_SOURCES: + return self._get_repo_list_sources() + else: + # short-circuiting, as these are only supposed to return a single value + for source in self.sources: + try: + return source.get_property_values(prop) + except PropertyError: + pass + except LookupError: + pass + raise LookupError + + def _get_vcs_repos(self): + for source in self.sources: + if DataProperty.VCS_REPOS in source.get_supported_properties(): + try: + iterator = source.get_property_values(DataProperty.VCS_REPOS) + except LookupError: + pass + else: + yield from iterator + + def _get_repo_list_sources(self): + for source in self.sources: + if DataProperty.REPO_LIST_SOURCES in source.get_supported_properties(): + try: + iterator = source.get_property_values(DataProperty.REPO_LIST_SOURCES) + except LookupError: + pass + else: + yield from iterator + +class RepoListManager(DataSource): + """A RepoListManager takes care of managing repo lists. + + Args: + config_manager (DataSource): The config manager from which the repo + lists come. + """ + def __init__(self, config_manager): + self.config_manager = EffectiveSource(config_manager) + self.sources = [self.config_manager] + + def exists(self): + return True + + def update(self): + excs = [self.config_manager.update()] + if DataProperty.REPO_LIST_SOURCES in self.config_manager.get_supported_properties(): + self.sources = [self.config_manager] + try: + it = self.config_manager.get_property_values(DataProperty.REPO_LIST_SOURCES) + except LookupError: + pass + else: + self.sources.extend(RemoteDataSource(rls.uri) for rls in it if rls.active) + for source in self.sources: + excs.append(source.update()) + return excs + + def get_supported_properties(self): + return {DataProperty.VCS_REPOS} + + def get_property_values(self, prop): + if prop not in self.get_supported_properties(): + raise PropertyError + assert prop == DataProperty.VCS_REPOS + # must raise exceptions *now* + # not when the generator runs + return self._get_vcs_repos(self.config_manager.get_property_values(DataProperty.VCS_REPOS)) + + def _get_vcs_repos(self, it): + assert self.config_manager == self.sources[0] + # config manager may override repo lists + yield from it + for source in self.sources: + if DataProperty.VCS_REPOS in source.get_supported_properties(): + try: + iterator = source.get_property_values(DataProperty.VCS_REPOS) + except LookupError: + pass + else: + for pctp in iterator: + # but repo lists aren't allowed to override anything + if pctp.active: + yield pctp + +class EffectiveSource(DataSource): + """Wraps another ``DataSource`` and yields "unique" results suitable + for general use. + + Methods on this class, in particular ``get_property_values``, handle + ``OverridableProperty`` overrides both to avoid code duplication and + so the user doesn't have to. + + Args: + raw_source (DataSource): The raw backing source. + """ + def __init__(self, raw_source): + self.raw_source = raw_source + + def exists(self): + return self.raw_source.exists() + + def update(self): + return self.raw_source.update() + + def get_property_value(self, prop): + return self.raw_source.get_property_value(prop) + + def get_supported_properties(self): + return self.raw_source.get_supported_properties() + + def get_property_values(self, prop): + # must raise exceptions *now* + # not when the generator runs + return self._wrap_values(prop, self.raw_source.get_property_values(prop)) + + def _wrap_values(self, prop, it): + if issubclass(prop.get_type(), OverridableProperty): + seen = {} + for v in it: + k = v.as_key() + if k in seen: + continue + seen[k] = v + yield v + else: + yield from it + + def __repr__(self): + return "EffectiveSource({!r})".format(self.raw_source) + +# class Config: +# def __init__(self, toml_file, base=None, remove=True): +# self.projects = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))) +# config_data = qtoml.load(toml_file) +# self.remote_configs = config_data.get('config_srcs', []) +# self.title = config_data.get('title', '') +# self.base_url = config_data.get('base_url', '') +# # TODO blocked domains (but only read them from config_data if remove is True) +# self.blocked_domains = [] +# self.blocked_domain_suffixes = [] +# self.blocked_domains.sort() +# self.blocked_domain_suffixes.sort(key=lambda x: x[::-1]) +# # FIXME remove duplicates and process invalid entries +# self.blocked_domains = tuple(self.blocked_domains) +# self.blocked_domain_suffixes = tuple(self.blocked_domain_suffixes) # MUST be tuple +# # TODO re.compile("(^" + "|^".join(map(re.escape, domains)) + "|" + "|".join(map(re.escape, suffixes) + ")$") +# if base: +# # FIXME is remove=remove the right thing to do? +# self._update_projects(base.projects, remove=remove, sanitize=False) # already sanitized +# projects = config_data.get('projects', {}) +# self._update_projects(projects, remove=remove) +# +# def _update_projects(self, projects, remove, sanitize=True): +# m = (m_ganarchy_config.CONFIG_PATTERN_SANITIZE if sanitize else m_ganarchy_config.CONFIG_PATTERN).match(projects) +# for v in m: +# commit, repo_url, branchname, options = v['commit'][0], v['url'][0], v['branch'][0], v['branch'][1] +# try: +# u = urlparse(repo_url) +# if not u: +# raise ValueError +# # also raises for invalid ports, see https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse +# # "Reading the port attribute will raise a ValueError if an invalid port is specified in the URL. [...]" +# if u.port == 0: +# raise ValueError +# if u.scheme not in ('http', 'https'): +# raise ValueError +# if (u.hostname in self.blocked_domains) or (u.hostname.endswith(self.blocked_domain_suffixes)): +# raise ValueError +# except ValueError: +# continue +# if branchname == "HEAD": +# branchname = None +# active = options.get('active', None) +# if active not in (True, False): +# continue +# branch = self.projects[commit][repo_url][branchname] +# branch['active'] = active or (branch.get('active', False) and not remove) |