summary refs log tree commit diff stats
path: root/htformtool.py
diff options
context:
space:
mode:
Diffstat (limited to 'htformtool.py')
-rwxr-xr-xhtformtool.py493
1 files changed, 493 insertions, 0 deletions
diff --git a/htformtool.py b/htformtool.py
new file mode 100755
index 0000000..218daa7
--- /dev/null
+++ b/htformtool.py
@@ -0,0 +1,493 @@
+#!/usr/bin/env python3
+
+import requests
+from bs4 import BeautifulSoup
+import click
+
+from enum import Enum
+
+# Version of the tool; also advertised in the default User-Agent header.
+VERSION = '0.1.0'
+
+# Session object created once at import time and used for the tool's requests.
+sess = requests.Session()
+
+# Headers used when requesting the (x)html document that holds the form.
+base_headers = {
+    'Accept': 'text/html,application/xhtml+xml',
+    'User-Agent': 'htformtool/' + VERSION,
+}
+
+# Headers meant for posting a form: request a plain-text confirmation code.
+post_headers = {'Accept': 'text/plain'}
+
+def hide_ua(ctx, param, value):
+    """
+    Click callback: sets the User-Agent entry of base_headers to None.
+
+    Does nothing when value is false or while click is doing resilient parsing.
+    Presumably a None header value makes requests leave the header out -- confirm.
+    """
+    if value and not ctx.resilient_parsing:
+        base_headers['User-Agent'] = None
+
+def split_on_ascii_whitespace(inp):
+    """
+    Splits a string on runs of ASCII whitespace (tab, LF, FF, CR, space).
+
+    Leading and trailing whitespace produce no empty tokens. Other whitespace
+    characters (e.g. vertical tab, no-break space) are kept as part of a token.
+
+    Returns the list of tokens, in order.
+    """
+    ascii_whitespace = '\x09\x0A\x0C\x0D\x20'
+    tokens = []
+    pending = []
+    for char in inp:
+        if char not in ascii_whitespace:
+            pending.append(char)
+        elif pending:
+            # a whitespace character ends the token collected so far
+            tokens.append(''.join(pending))
+            pending = []
+    if pending:
+        tokens.append(''.join(pending))
+    return tokens
+
+def ascii_lowercase(s):
+    """
+    Returns s with the ASCII letters A-Z replaced by a-z.
+
+    All other characters, including non-ASCII uppercase letters, are left alone.
+    """
+    from string import ascii_lowercase as lower, ascii_uppercase as upper
+    table = {ord(big): small for big, small in zip(upper, lower)}
+    return s.translate(table)
+
+def get_encoding(label):
+    """
+    Returns the codec for the given encoding label.
+
+    Only a few labels for UTF-8 are recognized so far (compared ASCII
+    case-insensitively); any other label raises NotImplementedError.
+    """
+    import codecs
+    utf8_labels = ('unicode-1-1-utf-8', 'utf-8', 'utf8')
+    if ascii_lowercase(label) in utf8_labels:
+        return codecs.lookup('utf-8')
+    raise NotImplementedError
+
+import re
+
+# Matches a CR that is not followed by LF, or an LF that is not preceded by CR.
+newline_normalize = re.compile('\x0D(?!\x0A)|(?<!\x0D)\x0A')
+
+def append_an_entry(l, name, value, no_line_break_normalization=False):
+    """
+    Appends a (name, value) entry to the entry list l.
+
+    Lone CR and lone LF characters in name are replaced by CRLF. The same is
+    done to value when it is a string, unless no_line_break_normalization is true.
+    Values that are not strings are appended unchanged.
+    """
+    # TODO might not be *strictly* correct
+    name = newline_normalize.sub('\r\n', name)
+    # re raises TypeError (not ValueError) for non-strings, so check the type up front
+    if isinstance(value, str) and not no_line_break_normalization:
+        value = newline_normalize.sub('\r\n', value)
+    l.append((name, value))
+
+class FieldState(Enum):
+    """
+    The kinds of form control htformtool distinguishes.
+
+    For <input> elements the value is the type attribute's keyword; the other
+    values are htformtool's own names for non-<input> controls.
+    """
+
+    # states of ordinary <input> elements
+    HIDDEN = 'hidden'
+    TEXT = 'text'
+    SEARCH = 'search'
+    TELEPHONE = 'tel'
+    URL = 'url'
+    EMAIL = 'email'
+    PASSWORD = 'password'
+    DATE = 'date'
+    MONTH = 'month'
+    WEEK = 'week'
+    TIME = 'time'
+    LOCAL_DATE_AND_TIME = 'datetime-local'
+    NUMBER = 'number'
+    RANGE = 'range'
+    COLOR = 'color'
+    CHECKBOX = 'checkbox'
+    RADIO = 'radio'
+    FILE = 'file'
+    SUBMIT = 'submit'
+    IMAGE = 'image'
+    RESET = 'reset'
+    BUTTON = 'button'
+
+    # <input> state specific to htformtool
+    CREDENTIALS = 'credentials'
+
+    # controls that are not <input> elements
+    TEXTAREA = 'textarea'
+    SELECT = 'select'
+
+    # states of <button> elements
+    BSUBMIT = 'bsubmit'
+    BRESET = 'breset'
+    BBUTTON = 'bbutton'
+
+    def is_button(self, submitter=None):
+        """
+        Returns whether this state is a button.
+
+        With submitter=True only buttons that submit the form count; with
+        submitter=False only buttons that don't. None accepts both.
+        """
+        if self in (FieldState.SUBMIT, FieldState.IMAGE, FieldState.BSUBMIT):
+            submits = True
+        elif self in (FieldState.BUTTON, FieldState.RESET, FieldState.BBUTTON, FieldState.BRESET):
+            submits = False
+        else:
+            return False
+        if submitter is None:
+            return True
+        return submitter == submits
+
+    def blocks_implicit_submission(self):
+        """
+        Returns whether this state is one of the states that block implicit submission.
+        """
+        blocking = {
+            FieldState.TEXT, FieldState.SEARCH, FieldState.URL, FieldState.TELEPHONE,
+            FieldState.EMAIL, FieldState.PASSWORD, FieldState.DATE, FieldState.MONTH,
+            FieldState.WEEK, FieldState.TIME, FieldState.LOCAL_DATE_AND_TIME, FieldState.NUMBER,
+        }
+        return self in blocking
+
+class ConstraintError(ValueError):
+    """Raised when the value of a form field doesn't satisfy its constraints."""
+
+class FormData:
+    """
+    Represents the data to be submitted by the form.
+
+    Attributes:
+        encoding: the codec object that should be used to encode the form for sending
+        entry_list: the entry list
+        action: the form's raw action (URL) (not parsed)
+        enctype: the form's enctype
+        method: the form's method (not sanitized)
+        target: the form's target (not sanitized)
+    """
+
+    def __init__(self, encoding, entry_list, action, enctype, method, target):
+        # what to send, and how to encode it
+        self.entry_list = entry_list
+        self.encoding = encoding
+        self.enctype = enctype
+        # where and how to send it
+        self.action = action
+        self.method = method
+        self.target = target
+
+class Form:
+    """
+    A <form> element together with the controls that belong to it.
+
+    `form` is the parsed <form> element, `elements` the parsed elements of its
+    controls and `fields` the matching FormField objects.
+    """
+
+    def __init__(self, form):
+        self.form = form
+        # these have the same length
+        self.elements = []
+        self.fields = []
+
+    def submit(self, document_encoding, submitter=None):
+        """
+        Submits the form implicitly, or with the given submitter.
+
+        When no submitter is given, the first submit button among the form's fields
+        (if any) is used as the submitter.
+
+        Raises ValueError if the given submitter isn't a valid submitter.
+
+        Raises ConstraintError if the no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)
+
+        Returns a FormData object, or None if implicit submission is not allowed.
+        """
+        if submitter is not None:
+            if submitter not in self.fields:
+                raise ValueError
+            if not submitter.is_button(submitter=True):
+                raise ValueError
+        else:
+            submitter = self._default_button()
+            if submitter is None:
+                # without a default button, more than one blocking field forbids implicit submission
+                blocking = sum(1 for field in self.fields if field.blocks_implicit_submission())
+                if blocking > 1:
+                    return None
+
+        if submitter is not None:
+            skip_validation = submitter.no_validate()
+        else:
+            # novalidate is a boolean attribute: its presence counts, not its value
+            skip_validation = self.form.get('novalidate') is not None
+        if not skip_validation:
+            for field in self.fields:
+                field.check_value()
+
+        encoding = self._pick_encoding(document_encoding)
+        entry_list = self._build_entry_list(submitter)
+
+        action = self._submission_attribute(submitter, 'formaction', 'action')
+        if action is None:
+            action = ''
+
+        # a missing enctype/method is treated like an unknown one and gets the default
+        enctype = ascii_lowercase(self._submission_attribute(submitter, 'formenctype', 'enctype') or '')
+        if enctype not in ('application/x-www-form-urlencoded', 'multipart/form-data', 'text/plain'):
+            enctype = 'application/x-www-form-urlencoded'
+
+        method = ascii_lowercase(self._submission_attribute(submitter, 'formmethod', 'method') or '')
+        if method not in ('get', 'post', 'dialog'):
+            method = 'get'
+
+        # WARNING: NOT SANITIZED
+        if submitter is not None and submitter.field.get('formtarget') is not None:
+            target = submitter.field['formtarget']
+        elif self.form.get('target') is not None:
+            target = self.form['target']
+        else:
+            # fall back to the target of the document's <base> element, if there is one
+            document = self.form.find_parent('[document]')
+            base = document.base if document is not None else None
+            if base is not None and base.get('target') is not None:
+                target = base['target']
+            else:
+                target = ''
+
+        return FormData(encoding, entry_list, action, enctype, method, target)
+
+    def _default_button(self):
+        """Returns the first field that is a submit button, or None if there is none."""
+        for field in self.fields:
+            if field.is_button(submitter=True):
+                return field
+        return None
+
+    def _pick_encoding(self, document_encoding):
+        """Returns the first usable accept-charset encoding, or document_encoding if the attribute is absent."""
+        accept_charset = self.form.get('accept-charset')
+        if accept_charset is None:
+            return document_encoding
+        if isinstance(accept_charset, str):
+            labels = split_on_ascii_whitespace(accept_charset)
+        else:
+            # Beautiful Soup hands multi-valued attributes such as accept-charset over as a list
+            labels = list(accept_charset)
+        for label in labels:
+            enc = get_encoding(label)
+            if enc is not None:
+                return enc
+        return get_encoding('utf-8')
+
+    def _build_entry_list(self, submitter):
+        """Returns the list of (name, value) entries contributed by the form's fields."""
+        entry_list = []
+        for field in self.fields:
+            if field.is_button() and field is not submitter:
+                continue
+            if field.is_checkable() and not field.is_checked():
+                continue
+            if field.is_image_button():
+                prefix = field.field['name'] + '.' if field.field.get('name') else ''
+                # the coordinates are always reported as zero
+                append_an_entry(entry_list, prefix + 'x', '0')
+                append_an_entry(entry_list, prefix + 'y', '0')
+                continue
+            name = field.field['name']
+            if field.is_select():
+                for option in field.get_options():
+                    raise NotImplementedError
+            elif field.is_checkable():
+                append_an_entry(entry_list, name, field.get_value())
+            elif field.is_file():
+                raise NotImplementedError
+            elif field.state is FieldState.HIDDEN and name == '_charset_':
+                raise NotImplementedError
+            elif field.is_textarea():
+                raise NotImplementedError
+            else:
+                append_an_entry(entry_list, name, field.get_value())
+            if field.has_valid_dirname():
+                raise NotImplementedError
+        return entry_list
+
+    def _submission_attribute(self, submitter, override, fallback):
+        """Returns the submitter's override attribute if present, else the form's non-empty fallback attribute, else None."""
+        if submitter is not None and submitter.field.get(override) is not None:
+            return submitter.field[override]
+        return self.form.get(fallback) or None
+
+class FormField:
+    """
+    A single form control (<input>, <button>, <textarea> or <select>) of a Form.
+
+    `field` is the parsed element, `form` the owning Form and `state` the
+    FieldState the element was classified as.
+    """
+
+    def __init__(self, form, field):
+        """
+        Classifies the element, adds this field to form.fields and resets it.
+
+        Raises ValueError if the element is not an <input>, <button>, <textarea> or <select>.
+        """
+        self.field = field
+        self.form = form
+
+        if field.name == 'input':
+            ftype = ascii_lowercase(field.get('type', 'text'))
+            # these FieldState values are not <input> types, so an <input> must not select them
+            if ftype in ('textarea', 'select', 'bbutton', 'bsubmit', 'breset'):
+                state = FieldState.TEXT
+            else:
+                try:
+                    state = FieldState(ftype)
+                except ValueError:
+                    # unknown types are treated as text
+                    state = FieldState.TEXT
+        elif field.name == 'button':
+            ftype = ascii_lowercase(field.get('type', 'submit'))
+            if ftype == 'reset':
+                state = FieldState.BRESET
+            elif ftype == 'button':
+                state = FieldState.BBUTTON
+            else:
+                state = FieldState.BSUBMIT
+        elif field.name == 'textarea':
+            state = FieldState.TEXTAREA
+        elif field.name == 'select':
+            state = FieldState.SELECT
+        else:
+            raise ValueError
+        self.state = state
+        # register only once the element is known to be a form control
+        form.fields.append(self)
+
+        self.reset()
+
+    def reset(self):
+        """
+        Restores the field to the state described by its element's attributes.
+        """
+        if self.is_checkable():
+            self._checked = self.field.get('checked') is not None
+        if self.is_select():
+            self.options = []
+            raise NotImplementedError
+        if self._is_text_based():
+            self.text = self.field.get('value') or ''
+
+    def _is_text_based(self):
+        raise NotImplementedError
+
+    def is_button(self, submitter=None):
+        """
+        Returns whether this field is a button; see FieldState.is_button for submitter.
+        """
+        return self.state.is_button(submitter=submitter)
+
+    def blocks_implicit_submission(self):
+        """
+        Returns whether this field blocks implicit submission.
+        """
+        return self.state.blocks_implicit_submission()
+
+    def is_textarea(self):
+        return self.state == FieldState.TEXTAREA
+
+    def is_hidden(self):
+        return self.state == FieldState.HIDDEN
+
+    def is_image_button(self):
+        return self.state == FieldState.IMAGE
+
+    def is_radio(self, name=None):
+        """
+        Returns whether this field is a radio button, and has the given name if one is given.
+        """
+        return self.state == FieldState.RADIO and (name is None or self.field.get('name') == name)
+
+    def is_file(self):
+        return self.state == FieldState.FILE
+
+    def is_checkable(self):
+        return self.state in (FieldState.CHECKBOX, FieldState.RADIO)
+
+    def is_checked(self):
+        """
+        Returns the checkedness of this field.
+
+        Raises ValueError if this field is not a checkbox or radio button.
+        """
+        if not self.is_checkable():
+            raise ValueError
+        return self._checked
+
+    def is_select(self):
+        return self.field.name == 'select'
+
+    def has_valid_dirname(self):
+        raise NotImplementedError
+        #return (self.is_textarea() or (self.field.name == 'input' and self.field.get('type') not in NOT_TEXT)) and bool(self.field.get('dirname'))
+
+    def get_options(self, selected=None, disabled=False):
+        if not self.is_select():
+            raise ValueError
+        raise NotImplementedError
+
+    def set_checked(self, checkedness=True):
+        """
+        Sets the checkedness of this field.
+
+        Checking a radio button unchecks the checked radio buttons of the same name in the form.
+
+        Raises ValueError if this field is not checkable, or when trying to uncheck a radio button.
+        """
+        if not self.is_checkable():
+            raise ValueError
+        if self.is_radio():
+            if not checkedness:
+                raise ValueError # you can't really un-check a radio
+            for field in self.form.fields:
+                if field.is_radio(name=self.field['name']) and field.is_checked():
+                    field._checked = False
+        self._checked = checkedness
+
+    def set_value(self, value):
+        raise NotImplementedError
+
+    def get_value(self):
+        """
+        Returns the value this field submits.
+
+        Only implemented for checkboxes and radio buttons, whose value defaults to 'on'.
+        """
+        if self.is_checkable():
+            return self.field.get('value', default='on')
+        raise NotImplementedError
+
+    def check_value(self):
+        """
+        Checks if the value in this field satisfies the constraints.
+
+        Raises ConstraintError if it doesn't.
+        """
+        raise NotImplementedError
+
+    def no_validate(self):
+        """
+        Returns this element's no-validate state.
+
+        Raises ValueError if this element is not a submitter button.
+        """
+        if not self.is_button(submitter=True):
+            raise ValueError
+        if self.field.get('formnovalidate') is not None:
+            return True
+        if self.form.form.get('novalidate') is not None:
+            return True
+        return False
+
+    def submit(self, document_encoding):
+        """
+        Submits the form with this field as submitter.
+
+        Raises ValueError if this field isn't a valid submitter.
+
+        Raises ConstraintError if this field's no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)
+
+        Returns whatever the form's submit method returns for this submitter.
+        """
+        return self.form.submit(document_encoding=document_encoding, submitter=self)
+
+def extract_forms(document):
+    """
+    Finds the forms of a parsed document and the controls that belong to each.
+
+    A control with a form attribute belongs to the form with that ID, provided the
+    first element with that ID is a <form>; otherwise it belongs to no form. A
+    control without a form attribute belongs to its nearest ancestor <form>.
+    Disabled controls, controls without a name (other than image buttons) and
+    controls named 'isindex' are dropped.
+
+    Returns a list with one Form per <form> element, each with its elements and fields filled in.
+    """
+    forms = [Form(form) for form in document.find_all('form')]
+    # keyed by identity: elements with identical markup compare (and hash) equal
+    forms_by_object = {id(form.form): form for form in forms}
+    for element in document.find_all(['button', 'input', 'select', 'textarea']):
+        owner = None
+        form_id = element.get('form')
+        if form_id is not None:
+            # an explicit form attribute never falls back to the ancestor form
+            target = document.find(id=form_id) if form_id else None
+            if target is not None and target.name == 'form':
+                owner = forms_by_object[id(target)]
+        else:
+            parent_form = element.find_parent('form')
+            if parent_form is not None:
+                owner = forms_by_object[id(parent_form)]
+        if owner is not None:
+            owner.elements.append(element)
+    # we don't need this anymore
+    del forms_by_object
+
+    # disabled controls
+    def is_disabled(field):
+        if field.get('disabled') is not None:
+            return True
+        for fieldset in field.find_parents('fieldset'):
+            if fieldset.get('disabled') is None:
+                continue
+            # controls inside the fieldset's first <legend> child are exempt,
+            # but an outer disabled fieldset may still disable them
+            legend = fieldset.find('legend', recursive=False)
+            if legend is not None and any(parent is legend for parent in field.parents):
+                continue
+            return True
+        return False
+
+    def should_keep(field):
+        if is_disabled(field):
+            return False
+        is_image_button = field.name == 'input' and ascii_lowercase(field.get('type') or '') == 'image'
+        if not is_image_button and not field.get('name'):
+            return False
+        # weird edge-case
+        if field.get('name') == 'isindex':
+            return False
+        return True
+
+    # clean up
+    for form in forms:
+        # TODO clean this up
+        kept = [FormField(form, field) for field in form.elements if should_keep(field)]
+        form.elements = [ff.field for ff in kept]
+
+    return forms
+
+@click.command()
+@click.option('--encoding', default='', help='Overrides the character encoding of the document.', metavar='ENCODING')
+@click.option('--fallback-encoding', default='utf-8', help='Sets the encoding used if the encoding can\'t be determined by another means. Ignored if --encoding is used.', show_default=True, metavar='ENCODING')
+@click.option('--hide-ua', is_flag=True, help='Prevent sending the User-Agent string.', expose_value=False, is_eager=True, callback=hide_ua)
+@click.version_option(version=VERSION, prog_name="htformtool")
+@click.argument('url')
+def htformtool(encoding, fallback_encoding, url):
+    """
+    Fetches URL and lists the HTML forms found in the document.
+    """
+    sess.headers.update(base_headers)
+    sess.headers.update({'Accept-Charset': encoding or fallback_encoding})
+    r = sess.get(url)
+    try:
+        if not click.confirm("Using {}, continue?".format(r.url), default=True):
+            return
+    except click.Abort:
+        click.echo("") # just a newline
+        return
+    content_type = r.headers.get('Content-Type')
+    if content_type is None:
+        # no content-type, just assume HTML. it's probably good enough for our needs.
+        isXML = None
+    else:
+        # media types are case-insensitive and may carry parameters (e.g. charset)
+        media_type = content_type.split(';', 1)[0].strip().lower()
+        if media_type == 'text/html':
+            isXML = False
+        elif media_type == 'application/xhtml+xml':
+            isXML = True
+        else:
+            # not an acceptable content type, ignore it.
+            click.echo("No HTML found.")
+            return
+    if encoding:
+        r.encoding = encoding
+    if r.encoding is None:
+        # FIXME
+        r.encoding = fallback_encoding
+    # r.content is raw bytes, so an --encoding override has to be given to the parser itself
+    soup = BeautifulSoup(r.content, "xml" if isXML else "html5lib", from_encoding=encoding or None)
+    forms = extract_forms(soup)
+    if not forms:
+        click.echo("No forms found.")
+    else:
+        click.echo("There are {} forms:".format(len(forms)))
+        # TODO
+        click.echo(forms)
+
+
+# Run the click command only when this file is executed as a script.
+if __name__ == '__main__':
+    htformtool()