#!/usr/bin/env python
# HTMLGDump - dumps a git repo to html (and symlinks)
# Copyright (C) 2021 Soni L.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# tl;dr: install this as a git hook (post-receive)
# then configure your webserver and stuff
import dataclasses
import os
import os.path
import pathlib
import shutil
import subprocess
import sys
from urllib.parse import quote
import pygit2
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_for_filename
from pygments.lexers import guess_lexer
from pygments.lexers import guess_lexer_for_filename
import pygments.util
@dataclasses.dataclass
class GitChange:
    """A single ref update: old value, new value and ref name.

    ``deleting`` is not accepted by the constructor; it is derived from
    ``new_value`` right after the other fields have been set.
    """

    old_value: str
    new_value: str
    ref_name: str
    deleting: bool = dataclasses.field(init=False)

    def __post_init__(self):
        # A new value made only of zeros, 40 or 64 characters long
        # (presumably the SHA-1 / SHA-256 hex widths), marks a deletion.
        zero_ids = ("0" * 40, "0" * 64)
        self.deleting = self.new_value in zero_ids
def get_relative(path, target):
    """Return ``target`` rewritten as a path relative to ``path``.

    The result is computed by ``os.path.relpath`` with ``path`` as the
    starting directory.
    """
    relative = os.path.relpath(target, path)
    return relative
def find_lexer(text, meta):
    """Pick a pygments lexer for ``text``, or return None if none is found.

    ``meta`` is a sequence of entries whose element at index 1 is a filename.
    Three strategies are tried in order; a filename-based strategy is only
    accepted when every filename in ``meta`` resolves to the same lexer name.
    """
    # TODO this can probably be improved
    filenames = [entry[1] for entry in meta]

    # 1. filename only: by far the fastest, but may sometimes be wrong.
    try:
        names_by_filename = {
            get_lexer_for_filename(filename).name for filename in filenames
        }
        if len(names_by_filename) == 1:
            return get_lexer_for_filename(filenames[0])
    except pygments.util.ClassNotFound:
        pass

    # 2. filename and content together.
    try:
        names_by_guess = {
            guess_lexer_for_filename(filename, text).name
            for filename in filenames
        }
        if len(names_by_guess) == 1:
            return guess_lexer_for_filename(filenames[0], text)
    except pygments.util.ClassNotFound:
        pass

    # 3. content only.
    try:
        return guess_lexer(text)
    except pygments.util.ClassNotFound:
        return None
# Cache root: $XDG_CACHE_HOME when it is set and non-empty, otherwise
# $HOME/.cache; everything this script generates lives under "htmlgdump".
_cache_root = os.environ.get('XDG_CACHE_HOME') or (os.environ['HOME'] + '/.cache')
CACHE_HOME = _cache_root + "/htmlgdump"
# post-receive runs on $GIT_DIR
repo = pygit2.Repository(os.getcwd())
# The output name is the repository's path relative to the directory set in
# the "htmlgdump.base" config option. When the option is missing (KeyError)
# or the repository does not live under that base (ValueError), this
# repository is not handled and the hook exits quietly.
try:
    name = pathlib.Path.cwd().relative_to(repo.config["htmlgdump.base"])
except (KeyError, ValueError):
    # sys.exit() instead of the bare exit() helper: the latter is injected by
    # the site module for interactive use and may not exist in every
    # interpreter (e.g. when started with -S).
    sys.exit()
# Each stdin line is split on spaces into at most three fields, which are
# passed positionally as GitChange's old_value, new_value and ref_name.
changes = [
    GitChange(*line.rstrip("\n").split(" ", 2))
    for line in sys.stdin
]
# Everything is generated under <cache>/<name>/gen first.
gen_dir = pathlib.Path(CACHE_HOME) / name / "gen"
gen_dir.mkdir(parents=True, exist_ok=True)
# Commits whose pages still have to be generated. Filled by the ref passes
# and drained by the "generating commits" loop further down.
todocommits = set()
print("updating refs")
# build changed refs
# One directory per pushed ref, named after the ref, directly under gen_dir.
for c in changes:
    path = gen_dir / c.ref_name
    if c.deleting:
        # The ref is being deleted: remove its generated directory. A
        # directory that was never generated is not an error.
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass
    else:
        path.mkdir(parents=True,exist_ok=True)
        index = path / "index.html"
        link = path / "tree"
        # NOTE(review): .tree_id is read straight off repo[c.new_value] here,
        # while the "generating refs" pass peels to a Commit first - confirm
        # that non-commit targets (e.g. annotated tags) cannot reach this line.
        tree = gen_dir / "trees" / str(repo[c.new_value].tree_id)
        # "w" mode: the page of an updated ref is always rewritten.
        with index.open("w") as f:
            # TODO
            # NOTE(review): this literal spans two physical lines as it
            # stands, which looks like lost markup - confirm the intended
            # page content against the upstream file.
            f.write("
refview tree")
        # queue the new commit so its page (and tree) get generated
        todocommits.add(repo[c.new_value])
        linktarget = get_relative(path, tree)
        # Drop the symlink left by the ref's previous value (if any) before
        # pointing "tree" at the new tree directory.
        link.unlink(missing_ok=True)
        link.symlink_to(linktarget, target_is_directory=True)
print("generating refs")
# Every reference that has no index.html yet gets a directory with a page
# and a "tree" symlink pointing at the tree of the commit it peels to.
for ref_name in repo.references:
    reference = repo.references.get(ref_name)
    ref_dir = gen_dir / reference.name
    ref_dir.mkdir(parents=True, exist_ok=True)
    commit = reference.peel(pygit2.Commit)
    tree_dir = gen_dir / "trees" / str(commit.tree_id)
    try:
        # exclusive create: fails when the page is already there
        page = (ref_dir / "index.html").open("x")
    except FileExistsError:
        # this ref already has a page, nothing to do for it
        continue
    with page:
        # TODO
        page.write("refview tree")
    # queue the commit so its page (and tree) get generated
    todocommits.add(commit)
    (ref_dir / "tree").symlink_to(
        get_relative(ref_dir, tree_dir), target_is_directory=True
    )
# Trees whose pages still have to be generated.
todotrees = set()
print("generating commits")
# Drain the pending commits. Each newly written commit queues its tree and
# its parents; a commit whose page already exists is skipped without
# queueing anything.
while todocommits:
    commit = todocommits.pop()
    commit_dir = gen_dir / "commits" / str(commit.id)
    commit_dir.mkdir(parents=True, exist_ok=True)
    try:
        # exclusive create: fails when the page is already there
        page = (commit_dir / "index.html").open("x")
    except FileExistsError:
        # this commit was generated before
        continue
    with page:
        # TODO
        page.write("commitview tree")
    todotrees.add(commit.tree)
    todocommits.update(commit.parents)
    tree_dir = gen_dir / "trees" / str(commit.tree_id)
    (commit_dir / "tree").symlink_to(
        get_relative(commit_dir, tree_dir), target_is_directory=True
    )
# a dict /!\
# maps blobs to some metadata
# Each value is a list of (filemode, name) pairs, one per tree entry that
# referenced the blob; find_lexer later reads the name at index 1.
# FIXME this can get quite expensive with larger repos, and might even run out
# of RAM.
todoblobs = {}
print("generating trees")
# build trees
# Drain the pending trees. Every tree gets a directory with an index.html
# listing plus one symlink per entry; subtrees are queued as they are found.
while todotrees:
    t = todotrees.pop()
    path = gen_dir / "trees" / str(t.id)
    path.mkdir(parents=True,exist_ok=True)
    index = path / "index.html"
    try:
        # exclusive create: fails when the page is already there
        f = index.open("x")
    except FileExistsError:
        # index.html already exists, so this tree was generated before
        continue
    with f:
        f.write("tree")
        for obj in t:
            linkname = obj.name
            # a git repo can contain any file, including index.html among
            # others, but you can never make a file conflict with the id of
            # the tree it's in. (or at least, it's impractical to do so.)
            # hashes are kinda awesome!
            # so we just mangle those to not conflict with our own index.html
            # note that this does mean the index.html files cannot be easily
            # permalinked, sorry.
            if linkname == "index.html":
                linkname = str(t.id) + "_index.html"
            # Percent-encode the (possibly mangled) name for the listing;
            # safe='' means "/" is encoded as well. The symlink itself uses
            # the unencoded name.
            quoted = quote(linkname, safe='')
            link = path / linkname
            if isinstance(obj, pygit2.Blob):
                # remember under which mode/name this blob was seen
                blobmeta = todoblobs.setdefault(obj, [])
                blobmeta += [(obj.filemode, obj.name)]
                # despite its name, `tree` is the blob's output directory here
                tree = gen_dir / "blobs" / str(obj.id)
                linktarget = get_relative(path, tree)
                link.symlink_to(linktarget, target_is_directory=True)
                # FIXME html-escape
                # NOTE(review): the literal below spans two physical lines and
                # has one placeholder for two arguments - looks like lost
                # markup; confirm against the upstream file.
                f.write("- {}
".format(quoted, quoted))
            elif isinstance(obj, pygit2.Tree):
                # subtree: queue it and link to its own directory
                todotrees.add(obj)
                tree = gen_dir / "trees" / str(obj.id)
                linktarget = get_relative(path, tree)
                link.symlink_to(linktarget, target_is_directory=True)
                # FIXME html-escape
                # NOTE(review): same two-line literal / single placeholder as
                # in the blob branch - confirm the intended markup.
                f.write("- {}
".format(quoted, quoted))
            else:
                # TODO not implemented, sorry. altho apparently submodules use
                # commits in trees?
                raise TypeError
        # NOTE(review): this literal also spans two physical lines - confirm
        # the intended closing markup.
        f.write("
")
print("generating blobs")
# Drain the pending blobs. Every blob gets a directory holding an index.html
# page (syntax-highlighted when possible) and the raw content as raw.bin.
while todoblobs:
    blob, entries = todoblobs.popitem()
    blob_dir = gen_dir / "blobs" / str(blob.id)
    blob_dir.mkdir(parents=True, exist_ok=True)
    try:
        # exclusive create: fails when the page is already there
        page = (blob_dir / "index.html").open("x")
    except FileExistsError:
        # this blob was generated before
        continue
    with page:
        page.write("blob")
        page.write("view raw")
        try:
            source = blob.data.decode("utf-8", errors="strict")
            lexer = find_lexer(source, entries)
            if lexer is not None:
                page.write(highlight(source, lexer, HtmlFormatter()))
            # TODO maybe just write `text` (html escaped)? (no lexer found)
        except UnicodeError:
            # e.g. the blob is not valid UTF-8: no highlighted body is written
            pass
        page.write("")
    with (blob_dir / "raw.bin").open("wb") as raw_file:
        raw_file.write(blob)
# create index.html
# Top-level page listing HEAD (when it exists) followed by every reference.
# It is opened with "w", so it is rewritten on every run.
path = gen_dir / "index.html"
with path.open("w") as f:
    f.write("index")
    # an unborn HEAD has no name to list
    if not repo.head_is_unborn:
        ref = repo.head
        # safe='/' keeps the slashes of the ref name unencoded
        quoted = quote(ref.name, safe='/')
        # FIXME html-escape
        # NOTE(review): the literal below spans two physical lines and has
        # one placeholder for two arguments - looks like lost markup; confirm
        # against the upstream file.
        f.write("- {}
".format(quoted, quoted))
    for ref in repo.references:
        # iteration yields the reference name; look up the reference itself
        ref = repo.references.get(ref)
        quoted = quote(ref.name, safe='/')
        # FIXME html-escape
        # NOTE(review): same two-line literal / single placeholder as above.
        f.write("- {}
".format(quoted, quoted))
    # NOTE(review): this literal also spans two physical lines - confirm the
    # intended closing markup.
    f.write("
")
print("copying to output")
# CANNOT use shutil.copytree - it is broken.
# To avoid the "copying into an existing directory" ambiguity, the
# destination is always created as a directory first.
browse = pathlib.Path.cwd() / "browse"
browse.mkdir(parents=True, exist_ok=True)
# cp -R -P <every entry of gen_dir> <browse>
copy_cmd = ["cp", "-R", "-P"]
copy_cmd.extend(gen_dir.glob("*"))
copy_cmd.append(browse)
subprocess.run(copy_cmd, check=True)
# └── gen
# ├── blobs
# │ └── e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ ├── index.html
# │ └── raw.bin
# ├── commits
# │ ├── 21177a2933b1a9d21d8437159405c5bc68b4d32e
# │ │ ├── index.html
# │ │ └── tree -> ../../trees/1663be45d5f6b9f092c4b98d44cf7992b427172f
# │ └── 3ea9318f6271ece3c7560f18d0b22f50bd3cefe5
# │ ├── index.html
# │ └── tree -> ../../trees/17d6338b3a3dc189bdc3bea8481fe5f32fd388c8
# ├── refs
# │ └── heads
# │ └── default
# │ ├── index.html
# │ └── tree -> ../../../trees/1663be45d5f6b9f092c4b98d44cf7992b427172f
# └── trees
# ├── 1663be45d5f6b9f092c4b98d44cf7992b427172f
# │ ├── bar -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ ├── baz -> ../29ba47b07d262ad717095f2d94ec771194c4c083
# │ ├── deleteme -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ ├── foo -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ └── index.html
# ├── 17d6338b3a3dc189bdc3bea8481fe5f32fd388c8
# │ ├── bar -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ ├── baz -> ../29ba47b07d262ad717095f2d94ec771194c4c083
# │ ├── foo -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# │ └── index.html
# └── 29ba47b07d262ad717095f2d94ec771194c4c083
# ├── index.html
# └── qux -> ../../blobs/e69de29bb2d1d6434b8b29ae775ad8c2e48c5391