Skip to content

Commit 55d9c9d

Browse files
Refactor Repo and introduce branch detection (#167)
* Refactor Repo and introduce branch detection * Rework former regular expressions for HTTPS, SSH, abbreviated and plain notations into a single regex * Introduce three new attributes to `Repo`: * `branch`: The branch of the repository, for example `branch` in `gh://org/repo@branch`; the other notations work similarly. * `origin`: The original unchanged URL of the repository. * `treeurl`: The full URL including the branch of the repository. This was necessary as the URLs differ between Git services. * Introduce `_TREE_PATTERN` as a template for constructing tree-based URLs based on service. * Extend `__init__` to allow a `default_branch` argument (default is `None`). For URLs like `https://HOST/ORG/REPO.git` there is no branch to operate on. * Refactor the distinction logic between URLs from HTTPS, SSH, abbreviated, or plain. Use a `match`...`case` block. * Fix wrong notation of repo URL in `tests/utils/test_git.py` --------- Co-authored-by: Sushant Gaurav <[email protected]>
1 parent 29d2fb8 commit 55d9c9d

4 files changed

Lines changed: 237 additions & 76 deletions

File tree

changelog.d/167.refactor.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Refactor how Git repository URLs are parsed and handled.
2+
Previously, multiple regular expressions were used to identify different
3+
URL formats (like HTTPS, SSH, plain, and abbreviated notations).
4+
These have now been consolidated into a single, more robust regular expression.
5+
6+
Additionally, the :class:`~docbuild.models.repo.Repo` class was
7+
enhanced with new attributes (:attr:`Repo.branch`, :attr:`Repo.origin`,
8+
:attr:`Repo.treeurl`) to better manage repository details.

src/docbuild/models/repo.py

Lines changed: 174 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from dataclasses import dataclass, field
44
import re
55
from typing import ClassVar
6-
from urllib.parse import urlparse
76

87

98
@dataclass(frozen=True, init=False)
@@ -21,104 +20,196 @@ class Repo:
2120
"""The default host to use when constructing a URL from a short name."""
2221

2322
_MAP_SERVICE2URL: ClassVar[dict[str, str]] = {
24-
"gl": "https://gitlab.com",
25-
"gls": "https://gitlab.suse.de",
26-
"gh": "https://github.com",
2723
"bb": "https://bitbucket.org",
28-
"gt": "https://gitea.com",
2924
"cb": "https://codeberg.org",
25+
"gh": "https://github.com",
3026
"ghe": "https://github.enterprise.com",
27+
"git@": "https://github.com",
28+
"gl": "https://gitlab.com",
29+
"gls": "https://gitlab.suse.de",
30+
"gt": "https://gitea.com",
3131
}
32+
3233
_MAP_URL2SERVICE: ClassVar[dict[str, str]] = {
33-
v: k for k, v in _MAP_SERVICE2URL.items()
34+
v: k for k, v in _MAP_SERVICE2URL.items() if k != "git@"
3435
}
3536

36-
_SSH_PATTERN: ClassVar[re.Pattern] = re.compile(
37-
r"^(?P<user>[^@]+)@(?P<host>[^:]+):(?P<repo>.+?)(?:\.git)?$"
37+
_SERVICES: ClassVar[str] = "|".join(
38+
[k for k in _MAP_SERVICE2URL.keys() if k != "git@"]
3839
)
3940

40-
_SERVICE_PATTERN: ClassVar[re.Pattern] = re.compile(
41-
r"^(?P<abbr>[a-z]{2,4}):/{1,2}(?P<repo>.+?)(?:\.git)?$", re.IGNORECASE
41+
_REPOS_PATTERN: ClassVar[re.Pattern] = re.compile(
42+
rf"""^ # Start of string
43+
(?: # Start of URL formats group
44+
# Option 1: HTTPS (Supports .git OR /tree/branch)
45+
(?:
46+
# Capture the repo name *without* the optional trailing
47+
# ".git" so that the canonical URL does not end up with
48+
# a duplicated suffix like "repo.git.git".
49+
(?P<https_schema>https?)://(?P<https_host>[^/]+)/(?P<https_org>[^/]+)/(?P<https_repo>[^/@\s?#]+?)
50+
(?:\.git
51+
|
52+
/tree/(?P<tree_branch>[^/\s?#]+)
53+
)?/?
54+
)
55+
|
56+
# Option 2: SSH
57+
(?:
58+
(?P<ssh_schema>git@)
59+
(?P<ssh_host>[^:]+):(?P<ssh_org>[^/]+)/(?P<ssh_repo>[^/@\s?]+?)(?:\.git)?/?
60+
)
61+
|
62+
# Option 3: Abbreviated protocol-style URL (e.g., gh://, gl://) + optional @branch
63+
(?:
64+
(?:(?P<gh_schema>{_SERVICES})://)?
65+
(?P<gh_org>[^/]+)/(?P<gh_repo>[^/@\s?]+?)(?:/|\.git)?
66+
)
67+
) # End of URL formats group
68+
(?:@(?P<branch>[^@\s]+))? # Consolidated optional @branch suffix
69+
$ # End of string
70+
""",
71+
re.VERBOSE | re.IGNORECASE,
4272
)
73+
"""The regex to match for the different URL notations."""
74+
75+
_TREE_PATTERN: ClassVar[dict[str, str]] = {
76+
"bb": "https://bitbucket.com/{owner}/{repo}/",
77+
"cb": "https://codeberg.org/{owner}/{repo}/src/branch/{branch}",
78+
"gh": "https://github.com/{owner}/{repo}/tree/{branch}",
79+
"ghe": "https://github.enterprise.com/{owner}/{repo}/tree/{branch}",
80+
"git@": "https://github.com/{owner}/{repo}/tree/{branch}",
81+
"gl": "https://gitlab.com/{owner}/{repo}/-/tree/{branch}",
82+
"gls": "https://gitlab.suse.de/{owner}/{repo}/-/tree/{branch}",
83+
"gt": "https://gitea.com/{owner}/{repo}/src/branch/{branch}",
84+
}
85+
"""URL template for constructing tree URLs based on the service."""
4386

44-
_SCHEMA_PATTERN: ClassVar[re.Pattern] = re.compile(r"(?P<schema>https?)://")
87+
_default_branches = ("main", "master")
4588

4689
url: str = field(repr=False)
4790
"""The full URL of the repository."""
4891

92+
treeurl: str = field(init=False, repr=False)
93+
"""The full URL including the branch of the repository."""
94+
4995
surl: str
5096
"""The shortened URL version of the repository, for example gh://org/repo for
5197
a GitHub repo."""
5298

5399
name: str = field(init=False, repr=False)
54100
"""The abbreviated name of the repository (e.g., 'org/repo')."""
55101

56-
def __init__(self, value: str) -> None:
102+
branch: str | None = field(init=False, repr=False)
103+
"""The branch of the repository"""
104+
105+
origin: str = field(init=False, repr=False)
106+
"""The original unchanged URL of the repository."""
107+
108+
def __init__(self, value: str, default_branch: str | None = None) -> None:
57109
"""Initialize a repository model from a URL or a short name.
58110
111+
:param default_branch: The default branch to use if no branch is specified in the URL.
112+
59113
This initializer understands:
60114
61-
* A full URL like ``https://host/org/repo.git``.
62-
* A SSH URL like ``git@host:org/repo.git``.
63-
* An abbreviated URL like ``gh://org/repo`` for a GitHub URL.
115+
* A full URL like ``https://HOST/ORG/REPO.git`` or a URL pointing
116+
to a branch like ``https://HOST/ORG/REPO/tree/BRANCH``
117+
118+
* A SSH URL like ``git@HOST:ORG/REPO.git``.
119+
120+
* An abbreviated URL like ``SERVICE://ORG/REPO`` or ``SERVICE://ORG/REPO.git``
64121
The service part (before '://') is a two to four letter code:
65-
- ``gh`` for GitHub
66-
- ``gl`` for GitLab
67-
- ``bb`` for BitBucket
68-
- ``gt`` for Gitea
69-
- ``cb`` for Codeberg
70-
- ``ghe`` for GitHub Enterprise
71-
* An abbreviated name like ``org/repo`` which defaults to GitHub.
122+
- ``gh`` for GitHub (default)
123+
- ``gl`` for GitLab
124+
- ``bb`` for BitBucket
125+
- ``gt`` for Gitea
126+
- ``cb`` for Codeberg
127+
- ``ghe`` for GitHub Enterprise
128+
This makes the reference to a Git repo more readable.
129+
130+
* A plain notation like ``ORG/REPO`` which defaults to GitHub.
131+
132+
Branches other than default branches (main or master) are added
133+
by ``@BRANCH_NAME`` to the URL.
72134
"""
73135
if not value:
74136
raise ValueError("Repository value cannot be empty.")
75137

76-
url: str
77-
name: str
78-
79-
service_match = self._SERVICE_PATTERN.match(value)
80-
ssh_match = self._SSH_PATTERN.match(value)
81-
82-
if "https://" in value or "http://" in value:
83-
parsed_original = urlparse(value.lower())
84-
name = parsed_original.path.strip("/").rsplit(".git", 1)[0]
85-
url = f"{parsed_original.scheme}://{parsed_original.netloc}/{name}.git"
86-
host = f"{parsed_original.scheme}://{parsed_original.netloc}"
87-
surl = f"{self._MAP_URL2SERVICE.get(host, 'gh')}://{name}"
88-
89-
elif service_match:
90-
service = service_match.group("abbr").lower()
91-
name = service_match.group("repo").lower().rstrip("/")
92-
host = self._MAP_SERVICE2URL.get(service)
93-
if not host:
94-
raise ValueError(f"Unknown repo abbreviation: '{service}'")
95-
url = f"{host}/{name}.git"
96-
surl = f"{service}://{name}"
97-
98-
elif ssh_match:
99-
host = ssh_match["host"].lower()
100-
name = ssh_match["repo"].lower()
101-
name = name.rstrip("/")
102-
url = f"https://{host}/{name}.git"
103-
surl = f"{self._MAP_URL2SERVICE.get(host, 'gh')}://{name}"
104-
105-
elif "/" in value:
106-
value = value.lower()
107-
name = value.rsplit(".git", 1)[0].rstrip("/")
108-
url = f"{self.DEFAULT_HOST}/{name}.git"
109-
surl = f"gh://{name}"
110-
111-
else:
138+
# Store the original string
139+
object.__setattr__(self, "origin", value)
140+
141+
data = self._consolidate_match(value.lower())
142+
143+
# Consolidate data from regex match
144+
name = f"{data['org']}/{data['repo']}"
145+
branch = data.get("branch")
146+
host = data.get("host")
147+
schema = data.get("schema")
148+
149+
match schema:
150+
case "http" | "https":
151+
# For https, a host from regex does not include the schema
152+
service = self._MAP_URL2SERVICE.get(f"{schema}://{host}", "gh")
153+
url = f"{schema}://{host}/{name}.git"
154+
case "git@":
155+
# For ssh, map to service and get canonical URL
156+
service = self._MAP_URL2SERVICE.get(f"https://{host}", "gh")
157+
host = self._MAP_SERVICE2URL.get(service, self.DEFAULT_HOST)
158+
url = f"{host}/{name}.git"
159+
case _:
160+
# For abbreviations (gh://) or bare (org/repo)
161+
service = schema or "gh"
162+
host = self._MAP_SERVICE2URL.get(service, self.DEFAULT_HOST)
163+
url = f"{host}/{name}.git"
164+
165+
# Build URLs
166+
surl = f"{service}://{name}"
167+
if branch:
168+
surl += f"@{branch}"
169+
170+
# Create the effective branch for tree URL: prioritize found branch (either
171+
# from /tree/ or @branch), then the given default branch, then defined defaults.
172+
effective_branch = branch or default_branch or self._default_branches[0]
173+
treeurl_template = self._TREE_PATTERN.get(service, self._TREE_PATTERN["gh"])
174+
treeurl = treeurl_template.format(
175+
owner=data["org"], repo=data["repo"], branch=effective_branch
176+
)
177+
178+
# Use object.__setattr__ because the dataclass is frozen
179+
object.__setattr__(self, "url", url)
180+
object.__setattr__(self, "treeurl", treeurl)
181+
object.__setattr__(self, "name", name)
182+
object.__setattr__(self, "surl", surl)
183+
object.__setattr__(self, "branch", branch)
184+
185+
def _consolidate_match(self, value: str) -> dict:
186+
"""Consolidate keys for a cleaner API."""
187+
match = self._REPOS_PATTERN.match(value)
188+
if not match:
112189
raise ValueError(
113190
f"Invalid repository value: '{value}'. "
114191
"Expected a full HTTPS URL, SSH URL, abbr notation, "
115192
"or an abbreviated name."
116193
)
194+
raw_data = match.groupdict()
195+
result = {
196+
"schema": raw_data.get("https_schema")
197+
or raw_data.get("ssh_schema")
198+
or raw_data.get("gh_schema"),
199+
"host": raw_data.get("https_host") or raw_data.get("ssh_host"),
200+
"org": raw_data.get("https_org")
201+
or raw_data.get("ssh_org")
202+
or raw_data.get("gh_org"),
203+
"repo": raw_data.get("https_repo")
204+
or raw_data.get("ssh_repo")
205+
or raw_data.get("gh_repo"),
206+
}
117207

118-
# Use object.__setattr__ because the dataclass is frozen
119-
object.__setattr__(self, "url", url)
120-
object.__setattr__(self, "name", name)
121-
object.__setattr__(self, "surl", surl)
208+
# Branch Logic: Prioritize the /tree/ branch, fallback to @branch
209+
branch = raw_data.get("tree_branch") or raw_data.get("branch")
210+
result["branch"] = branch
211+
212+
return result
122213

123214
def __eq__(self, other: object) -> bool:
124215
"""Compare Repo with another Repo (by name) or a string (by name)."""
@@ -148,3 +239,24 @@ def slug(self) -> str:
148239
return self.url.translate(
149240
str.maketrans({":": "_", "/": "_", "-": "_", ".": "_"}),
150241
)
242+
243+
244+
if __name__ == "__main__":
245+
test_urls = [
246+
"https://github.com/lycheeverse/lychee/tree/relative-link-fixes", # New #variant
247+
"https://GitHub.com/opensuse/docbuild.git", # HTTPS no branch
248+
"git@github.com:openSUSE/docbuild.git", # SSH no branch
249+
"gh://openSUSE/docbuild", # Abbr no branch
250+
"gh://openSUSE/docbuild@v1",
251+
]
252+
253+
for url in test_urls:
254+
repo = Repo(url)
255+
print(f"┌ {url}")
256+
print(f"├─ {repo.treeurl=}")
257+
print(f"├─ {repo.surl=}")
258+
print(f"├─ {repo.name=}")
259+
print(f"├─ {repo.branch=}")
260+
print(f"└─ {repo.url=}")
261+
# print("└")
262+
print()

tests/models/test_repo.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ def repo_url(request) -> str:
3838
"org/repo_git",
3939
"https://github.com/org/repo_git.git",
4040
),
41+
# 7
42+
("http://a.b/org/c.git", "org/c", "http://a.b/org/c.git"),
4143
],
4244
)
4345
def test_repo_https(input_value, name, url):
@@ -153,6 +155,45 @@ def test_repo_abbreviated(input_value, name, url):
153155
assert repo.url == url
154156

155157

158+
@pytest.mark.parametrize(
159+
"input_value, name, branch, expected_surl, expected_tree",
160+
[
161+
# 1
162+
(
163+
"org/repo@main",
164+
"org/repo",
165+
"main",
166+
"gh://org/repo@main",
167+
"https://github.com/org/repo/tree/main",
168+
),
169+
# 2
170+
(
171+
"ORG/repo@develop",
172+
"org/repo",
173+
"develop",
174+
"gh://org/repo@develop",
175+
"https://github.com/org/repo/tree/develop",
176+
),
177+
# 3
178+
(
179+
"org/repo_git@v1.2.4",
180+
"org/repo_git",
181+
"v1.2.4",
182+
"gh://org/repo_git@v1.2.4",
183+
"https://github.com/org/repo_git/tree/v1.2.4",
184+
),
185+
# 4
186+
],
187+
)
188+
def test_repo_with_branch(input_value, name, branch, expected_surl, expected_tree):
189+
repo = Repo(input_value)
190+
assert repo.name == name
191+
assert repo.branch == branch
192+
assert repo.surl == expected_surl
193+
assert repo.treeurl == expected_tree
194+
assert repo.origin == input_value
195+
196+
156197
def test_repo_with_empty_value():
157198
with pytest.raises(ValueError):
158199
Repo("")

0 commit comments

Comments
 (0)