diff options
author | SoniEx2 <endermoneymod@gmail.com> | 2019-02-01 23:01:31 -0200 |
---|---|---|
committer | SoniEx2 <endermoneymod@gmail.com> | 2019-02-01 23:01:31 -0200 |
commit | aa84238845256d484e27b62643ab3d59563e5195 (patch) | |
tree | ef337d3efb011f26a22c984822648a302473243e /htformtool.py | |
parent | 713d7e8d1da8df1178fb4af4cd77cffff111ce41 (diff) |
I give up for now, might come back to this later HEAD htformtool
Diffstat (limited to 'htformtool.py')
-rwxr-xr-x | htformtool.py | 493 |
1 files changed, 493 insertions, 0 deletions
#!/usr/bin/env python3
"""htformtool: fetch an (X)HTML document and model the forms it contains.

This module is a work in progress.  Several parts of the HTML form
submission algorithm are not implemented yet and raise NotImplementedError
(field validation, text values, <select> options, file uploads, dirname
handling, ...).
"""

import codecs
import re
import string
from enum import Enum

import click
import requests
from bs4 import BeautifulSoup

VERSION = '0.1.0'

sess = requests.Session()
base_headers = {
    # request (x)html form
    'Accept': 'text/html,application/xhtml+xml',
    'User-Agent': 'htformtool/{version}'.format(version=VERSION),
    }
post_headers = {
    # request confirmation code
    'Accept': 'text/plain',
    }

# One or more of TAB, LF, FF, CR, SPACE (the characters the original scanner
# treated as separators).
_ascii_whitespace_run = re.compile('[\x09\x0A\x0C\x0D\x20]+')

# Translation table mapping A-Z to a-z and leaving everything else alone.
_ascii_lowercase_table = str.maketrans(string.ascii_uppercase,
                                       string.ascii_lowercase)

# FieldState values that exist for non-<input> elements and therefore must
# not be selectable through <input type="...">.
_NON_INPUT_STATE_VALUES = ('textarea', 'select', 'bbutton', 'bsubmit', 'breset')


def hide_ua(ctx, param, value):
    """Click callback for --hide-ua: stop sending the User-Agent header.

    Does nothing when the flag is not set or while click is doing resilient
    parsing (e.g. for shell completion).
    """
    if not value or ctx.resilient_parsing:
        return
    base_headers['User-Agent'] = None


def split_on_ascii_whitespace(inp):
    """Split *inp* on runs of ASCII whitespace (TAB, LF, FF, CR, SPACE).

    Leading and trailing whitespace is ignored; the result never contains
    empty strings.  Unlike ``str.split()`` this does not split on other
    Unicode whitespace.
    """
    return [token for token in _ascii_whitespace_run.split(inp) if token]


def ascii_lowercase(s):
    """Return *s* with only the ASCII letters A-Z converted to lowercase."""
    return s.translate(_ascii_lowercase_table)


def get_encoding(label):
    """Return the codec for the encoding *label*.

    Only the UTF-8 labels are supported so far; surrounding ASCII whitespace
    and ASCII case are ignored when matching.

    Raises NotImplementedError for any other label.
    """
    normalized = ascii_lowercase(label.strip('\x09\x0A\x0C\x0D\x20'))
    if normalized not in ('unicode-1-1-utf-8', 'utf-8', 'utf8'):
        raise NotImplementedError
    return codecs.lookup('utf-8')


# Matches a lone CR (not followed by LF) or a lone LF (not preceded by CR).
newline_normalize = re.compile('\x0D(?!\x0A)|(?<!\x0D)\x0A')


def append_an_entry(l, name, value, no_line_break_normalization=False):
    """Append the ``(name, value)`` pair to the entry list *l*.

    Lone CR and lone LF characters in *name* are replaced by CRLF.  The same
    is done for *value* unless *no_line_break_normalization* is true or
    *value* is not a string (non-string values are appended untouched).
    """
    # TODO might not be *strictly* correct
    name = newline_normalize.sub('\r\n', name)
    # re.sub() raises TypeError on non-strings, so test the type up front
    # instead of relying on an exception handler.
    if not no_line_break_normalization and isinstance(value, str):
        value = newline_normalize.sub('\r\n', value)
    l.append((name, value))


class FieldState(Enum):
    """The kind of a form control, keyed by its (possibly synthetic) type."""

    # normal <input> types
    HIDDEN = 'hidden'
    TEXT = 'text'
    SEARCH = 'search'
    TELEPHONE = 'tel'
    URL = 'url'
    EMAIL = 'email'
    PASSWORD = 'password'
    DATE = 'date'
    MONTH = 'month'
    WEEK = 'week'
    TIME = 'time'
    LOCAL_DATE_AND_TIME = 'datetime-local'
    NUMBER = 'number'
    RANGE = 'range'
    COLOR = 'color'
    CHECKBOX = 'checkbox'
    RADIO = 'radio'
    FILE = 'file'
    SUBMIT = 'submit'
    IMAGE = 'image'
    RESET = 'reset'
    BUTTON = 'button'

    # custom, htformtool-specific <input> types
    CREDENTIALS = 'credentials'

    # non-<input> types
    TEXTAREA = 'textarea'
    SELECT = 'select'

    # <button> types
    BSUBMIT = 'bsubmit'
    BRESET = 'breset'
    BBUTTON = 'bbutton'

    def is_button(self, submitter=None):
        """Tell whether this state is a button.

        With ``submitter=None`` any button matches; with ``submitter=True``
        only buttons that can submit a form match; with ``submitter=False``
        only the non-submitting buttons match.
        """
        if self in (FieldState.BSUBMIT, FieldState.IMAGE, FieldState.SUBMIT):
            return submitter in (None, True)
        if self in (FieldState.BRESET, FieldState.BBUTTON,
                    FieldState.RESET, FieldState.BUTTON):
            return submitter in (None, False)
        return False

    def blocks_implicit_submission(self):
        """Tell whether a field in this state blocks implicit submission."""
        return self in (
            FieldState.TEXT, FieldState.SEARCH, FieldState.URL,
            FieldState.TELEPHONE, FieldState.EMAIL, FieldState.PASSWORD,
            FieldState.DATE, FieldState.MONTH, FieldState.WEEK,
            FieldState.TIME, FieldState.LOCAL_DATE_AND_TIME,
            FieldState.NUMBER,
        )


class ConstraintError(ValueError):
    """Raised when a field's value does not satisfy its constraints."""


class FormData:
    """
    Represents the data to be submitted by the form.

    Attributes:
        encoding: The codec object that should be used to encode the form
            for sending.
        entry_list: The entry list.
        action: The form's raw action (URL) (not parsed).
        enctype: The form's enctype.
        method: The form's method (not sanitized).
        target: The form's target (not sanitized).
    """

    def __init__(self, encoding, entry_list, action, enctype, method, target):
        self.encoding = encoding
        self.entry_list = entry_list
        self.action = action
        self.enctype = enctype
        self.method = method
        self.target = target


class Form:
    """A <form> element together with the controls that belong to it."""

    def __init__(self, form):
        self.form = form
        # these have the same length
        self.elements = []
        self.fields = []

    def submit(self, document_encoding, submitter=None):
        """
        Submits the form implicitly, or with the given submitter.

        Raises ValueError if the given submitter isn't a valid submitter.

        Raises ConstraintError if this field's no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)

        Returns a FormData object, or None if implicit submission is not allowed.
        """
        if submitter is not None:
            if submitter not in self.fields:
                raise ValueError('submitter does not belong to this form')
            if not submitter.is_button(submitter=True):
                raise ValueError('submitter is not a submit button')
            skip_validation = submitter.no_validate()
        else:
            # Implicit submission: use the first submit button if there is
            # one, independently of whether validation is enabled.
            submitter = self._default_button()
            if submitter is not None:
                skip_validation = submitter.no_validate()
            else:
                blockers = sum(1 for field in self.fields
                               if field.blocks_implicit_submission())
                if blockers > 1:
                    return None
                # Boolean attribute: present (even as "") means enabled.
                skip_validation = self.form.get('novalidate') is not None

        if not skip_validation:
            for field in self.fields:
                field.check_value()

        encoding = self._pick_encoding(document_encoding)
        entry_list = self._build_entry_list(submitter)

        action = self._submission_attribute(submitter, 'formaction', 'action')

        enctype = ascii_lowercase(
            self._submission_attribute(submitter, 'formenctype', 'enctype'))
        if enctype not in ('application/x-www-form-urlencoded',
                           'multipart/form-data', 'text/plain'):
            enctype = 'application/x-www-form-urlencoded'

        method = ascii_lowercase(
            self._submission_attribute(submitter, 'formmethod', 'method'))
        if method not in ('get', 'post', 'dialog'):
            method = 'get'

        # WARNING: NOT SANITIZED
        target = self._resolve_target(submitter)

        return FormData(encoding, entry_list, action, enctype, method, target)

    def _default_button(self):
        """Return the first field that is a submit button, or None."""
        for field in self.fields:
            if field.is_button(submitter=True):
                return field
        return None

    def _pick_encoding(self, document_encoding):
        """Choose the submission encoding from accept-charset, if present."""
        labels = self.form.get('accept-charset')
        if labels is None:
            return document_encoding
        # Beautiful Soup's HTML parsers hand accept-charset back as a list
        # (it is a multi-valued attribute); the XML parser returns a string.
        if isinstance(labels, str):
            labels = split_on_ascii_whitespace(labels)
        candidates = [enc for enc in map(get_encoding, labels)
                      if enc is not None]
        if not candidates:
            return get_encoding('utf-8')
        return candidates[0]

    def _build_entry_list(self, submitter):
        """Collect the (name, value) entries contributed by the fields."""
        entry_list = []
        for field in self.fields:
            if field.is_button() and field is not submitter:
                continue
            if field.is_checkable() and not field.is_checked():
                continue
            if field.is_image_button():
                name = field.field.get('name')
                prefix = name + '.' if name else ''
                # There is no pointer to take coordinates from, so the
                # click position is always reported as 0,0 (as strings,
                # like every other entry value).
                append_an_entry(entry_list, prefix + 'x', '0')
                append_an_entry(entry_list, prefix + 'y', '0')
                continue
            name = field.field['name']
            if field.is_select():
                for option in field.get_options():
                    raise NotImplementedError
            elif field.is_checkable():
                append_an_entry(entry_list, name, field.get_value())
            elif field.is_file():
                raise NotImplementedError
            elif field.is_hidden() and name == '_charset_':
                raise NotImplementedError
            elif field.is_textarea():
                raise NotImplementedError
            else:
                append_an_entry(entry_list, name, field.get_value())
            if field.has_valid_dirname():
                raise NotImplementedError
        return entry_list

    def _submission_attribute(self, submitter, override, attribute):
        """Return the submitter's *override* attribute or the form's *attribute*.

        The submitter wins whenever it carries the override attribute at
        all; otherwise a non-empty form attribute is used.  Returns '' when
        neither applies, so the result is always a string.
        """
        if submitter is not None:
            value = submitter.field.get(override)
            if value is not None:
                return value
        return self.form.get(attribute) or ''

    def _resolve_target(self, submitter):
        """Return the raw target from the submitter, form or <base> element."""
        if submitter is not None:
            target = submitter.field.get('formtarget')
            if target is not None:
                return target
        target = self.form.get('target')
        if target is not None:
            return target
        # The form may not be attached to a document at all.
        document = self.form.find_parent('[document]')
        base = document.base if document is not None else None
        if base is not None and base.get('target') is not None:
            return base['target']
        return ''


class FormField:
    """A single control (input, button, textarea or select) of a Form.

    A successfully constructed field registers itself in ``form.fields``.
    """

    def __init__(self, form, field):
        self.field = field
        self.form = form
        self.state = self._state_for(field)

        self.reset()

        # Register only once construction succeeded, so a failure above
        # does not leave a half-initialised field behind in the form.
        form.fields.append(self)

    @staticmethod
    def _state_for(field):
        """Map an element to its FieldState.

        Raises ValueError for elements that are not form controls.
        """
        tag = field.name
        if tag == 'input':
            ftype = ascii_lowercase(field.get('type', 'text'))
            if ftype in _NON_INPUT_STATE_VALUES:
                return FieldState.TEXT
            try:
                return FieldState(ftype)
            except ValueError:
                # unknown types fall back to a plain text field
                return FieldState.TEXT
        if tag == 'button':
            ftype = ascii_lowercase(field.get('type', 'submit'))
            if ftype == 'reset':
                return FieldState.BRESET
            if ftype == 'button':
                return FieldState.BBUTTON
            return FieldState.BSUBMIT
        if tag == 'textarea':
            return FieldState.TEXTAREA
        if tag == 'select':
            return FieldState.SELECT
        raise ValueError('unsupported form control: <{}>'.format(tag))

    def reset(self):
        """Restore the field's state from the attributes of its element.

        Currently always ends in NotImplementedError: <select> handling and
        the text-based check are not implemented yet.
        """
        if self.is_checkable():
            self._checked = self.field.get('checked') is not None
        if self.is_select():
            self.options = []
            raise NotImplementedError
        if self._is_text_based():
            self.text = self.field.get('value') or ''

    def _is_text_based(self):
        """Not implemented yet."""
        raise NotImplementedError

    def is_button(self, submitter=None):
        """See FieldState.is_button."""
        return self.state.is_button(submitter=submitter)

    def blocks_implicit_submission(self):
        """See FieldState.blocks_implicit_submission."""
        return self.state.blocks_implicit_submission()

    def is_textarea(self):
        return self.state == FieldState.TEXTAREA

    def is_image_button(self):
        return self.state == FieldState.IMAGE

    def is_hidden(self):
        return self.state == FieldState.HIDDEN

    def is_radio(self, name=None):
        """Tell whether this is a radio button (optionally: named *name*)."""
        return self.state == FieldState.RADIO and (
            name is None or self.field.get('name') == name)

    def is_file(self):
        return self.state == FieldState.FILE

    def is_checkable(self):
        return self.state in (FieldState.CHECKBOX, FieldState.RADIO)

    def is_checked(self):
        """Return the checkedness.  Raises ValueError if not checkable."""
        if not self.is_checkable():
            raise ValueError
        return self._checked

    def is_select(self):
        return self.field.name == 'select'

    def has_valid_dirname(self):
        """Not implemented yet."""
        # TODO: intended to be true for textareas and text-like inputs that
        # carry a non-empty dirname attribute.
        raise NotImplementedError

    def get_options(self, selected=None, disabled=False):
        """Not implemented yet.  Raises ValueError if this is not a <select>."""
        if not self.is_select():
            raise ValueError
        raise NotImplementedError

    def set_checked(self, checkedness=True):
        """Set the checkedness of a checkbox or radio button.

        Checking a radio button unchecks the other radio buttons of the
        same name in the form.

        Raises ValueError if the field is not checkable, or when trying to
        uncheck a radio button.
        """
        if not self.is_checkable():
            raise ValueError
        if self.is_radio():
            if not checkedness:
                raise ValueError  # you can't really un-check a radio
            name = self.field.get('name')
            # A radio button without a name is not part of any group.
            if name:
                for field in self.form.fields:
                    if (field is not self and field.is_radio(name=name)
                            and field.is_checked()):
                        field._checked = False
        self._checked = checkedness

    def set_value(self, value):
        """Not implemented yet."""
        raise NotImplementedError

    def get_value(self):
        """Return the value to submit.

        Only implemented for checkable fields, whose value defaults to 'on'.
        """
        if self.is_checkable():
            return self.field.get('value', default='on')
        raise NotImplementedError

    def check_value(self):
        """
        Checks if the value in this field satisfies the constraints.

        Raises ConstraintError if it doesn't.

        Not implemented yet.
        """
        raise NotImplementedError

    def no_validate(self):
        """
        Returns this element's no-validate state.

        Raises ValueError if this element is not a submitter button.
        """
        if not self.is_button(submitter=True):
            raise ValueError
        if self.field.get('formnovalidate') is not None:
            return True
        if self.form.form.get('novalidate') is not None:
            return True
        return False

    def submit(self, document_encoding):
        """
        Submits the form with this field as submitter.

        Raises ValueError if this field isn't a valid submitter.

        Raises ConstraintError if this field's no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)

        Returns the FormData object produced by Form.submit.
        """
        return self.form.submit(document_encoding=document_encoding,
                                submitter=self)


def _is_disabled(field):
    """Tell whether a control is disabled, directly or through a fieldset."""
    if field.get('disabled') is not None:
        return True
    for fieldset in field.find_parents('fieldset'):
        if fieldset.get('disabled') is None:
            continue
        legend = fieldset.find('legend', recursive=False)
        if legend is not None and any(ancestor is legend
                                      for ancestor in field.parents):
            # Controls inside the fieldset's first legend are exempt from
            # *this* fieldset only; an outer fieldset may still disable them.
            continue
        return True
    return False


def _should_keep(field):
    """Tell whether a control takes part in form submission at all."""
    if _is_disabled(field):
        return False
    is_image_input = (field.name == 'input'
                      and ascii_lowercase(field.get('type', '')) == 'image')
    if not is_image_input and not field.get('name'):
        return False
    # weird edge-case
    if field.get('name') == 'isindex':
        return False
    return True


def extract_forms(document):
    """Build a Form (with its FormFields) for every <form> in *document*.

    A control belongs to the form named by its ``form`` attribute when that
    id exists, otherwise to its closest <form> ancestor.  Disabled and
    unnamed controls are dropped.

    Returns the list of Form objects in document order.
    """
    forms = [Form(tag) for tag in document.find_all('form')]

    forms_with_id = {}
    for form in forms:
        form_id = form.form.get('id')
        if form_id is not None:
            # on duplicate ids the first form in the document wins
            forms_with_id.setdefault(form_id, form)

    # Keyed by identity: Beautiful Soup tags hash and compare by markup, so
    # two identical <form> elements would otherwise collide.
    forms_by_object = {id(form.form): form for form in forms}

    for element in document.find_all(['button', 'input', 'select', 'textarea']):
        owner = None
        form_id = element.get('form')
        if form_id is not None:
            owner = forms_with_id.get(form_id)
        if owner is None:
            parent_form = element.find_parent('form')
            if parent_form is not None:
                owner = forms_by_object[id(parent_form)]
        if owner is not None:
            owner.elements.append(element)

    for form in forms:
        # TODO clean this up
        kept = [FormField(form, element)
                for element in form.elements if _should_keep(element)]
        form.elements = [form_field.field for form_field in kept]

    return forms


# Command-line entry point: fetch URL, parse it and list the forms found.
# (Kept as a comment rather than a docstring so the --help output of the
# command does not change.)
@click.command()
@click.option('--encoding', default='', help='Overrides the character encoding of the document.', metavar='ENCODING')
@click.option('--fallback-encoding', default='utf-8', help='Sets the encoding used if the encoding can\'t be determined by another means. Ignored if --encoding is used.', show_default=True, metavar='ENCODING')
@click.option('--hide-ua', is_flag=True, help='Prevent sending the User-Agent string.', expose_value=False, is_eager=True, callback=hide_ua)
@click.version_option(version=VERSION, prog_name="htformtool")
@click.argument('url')
def htformtool(encoding, fallback_encoding, url):
    sess.headers.update(base_headers)
    sess.headers.update({'Accept-Charset': encoding or fallback_encoding})
    r = sess.get(url)
    try:
        if not click.confirm("Using {}, continue?".format(r.url), default=True):
            return
    except click.Abort:
        click.echo("")  # just a newline
        return

    # A missing header yields None here (indexing would raise KeyError).
    content_type = r.headers.get('Content-Type')
    if content_type is None:
        # no content-type, just assume HTML. it's probably good enough for our needs.
        is_xml = None
    else:
        media_type = content_type.strip().lower()
        if media_type.startswith('text/html'):
            is_xml = False
        elif media_type.startswith('application/xhtml+xml'):
            is_xml = True
        else:
            # not an acceptable content type, ignore it.
            click.echo("No HTML found.")
            return

    if encoding:
        r.encoding = encoding
    if r.encoding is None:
        # FIXME
        r.encoding = fallback_encoding
    # The parser is fed raw bytes, so r.encoding alone has no effect on it;
    # hand an explicit --encoding override to Beautiful Soup directly.
    soup = BeautifulSoup(r.content, "xml" if is_xml else "html5lib",
                         from_encoding=encoding or None)
    forms = extract_forms(soup)
    if not forms:
        click.echo("No forms found.")
    else:
        click.echo("There are {} forms:".format(len(forms)))
        # TODO
        click.echo(forms)


if __name__ == '__main__':
    htformtool()