33from dataclasses import dataclass , field
44import re
55from typing import ClassVar
6- from urllib .parse import urlparse
76
87
98@dataclass (frozen = True , init = False )
@@ -21,104 +20,196 @@ class Repo:
2120 """The default host to use when constructing a URL from a short name."""
2221
2322 _MAP_SERVICE2URL : ClassVar [dict [str , str ]] = {
24- "gl" : "https://gitlab.com" ,
25- "gls" : "https://gitlab.suse.de" ,
26- "gh" : "https://github.com" ,
2723 "bb" : "https://bitbucket.org" ,
28- "gt" : "https://gitea.com" ,
2924 "cb" : "https://codeberg.org" ,
25+ "gh" : "https://github.com" ,
3026 "ghe" : "https://github.enterprise.com" ,
27+ "git@" : "https://github.com" ,
28+ "gl" : "https://gitlab.com" ,
29+ "gls" : "https://gitlab.suse.de" ,
30+ "gt" : "https://gitea.com" ,
3131 }
32+
3233 _MAP_URL2SERVICE : ClassVar [dict [str , str ]] = {
33- v : k for k , v in _MAP_SERVICE2URL .items ()
34+ v : k for k , v in _MAP_SERVICE2URL .items () if k != "git@"
3435 }
3536
36- _SSH_PATTERN : ClassVar [re . Pattern ] = re . compile (
37- r"^(?P<user>[^@]+)@(?P<host>[^:]+):(?P<repo>.+?)(?:\.git)?$"
37+ _SERVICES : ClassVar [str ] = "|" . join (
38+ [ k for k in _MAP_SERVICE2URL . keys () if k != "git@" ]
3839 )
3940
40- _SERVICE_PATTERN : ClassVar [re .Pattern ] = re .compile (
41- r"^(?P<abbr>[a-z]{2,4}):/{1,2}(?P<repo>.+?)(?:\.git)?$" , re .IGNORECASE
41+ _REPOS_PATTERN : ClassVar [re .Pattern ] = re .compile (
42+ rf"""^ # Start of string
43+ (?: # Start of URL formats group
44+ # Option 1: HTTPS (Supports .git OR /tree/branch)
45+ (?:
46+ # Capture the repo name *without* the optional trailing
47+ # ".git" so that the canonical URL does not end up with
48+ # a duplicated suffix like "repo.git.git".
49+ (?P<https_schema>https?)://(?P<https_host>[^/]+)/(?P<https_org>[^/]+)/(?P<https_repo>[^/@\s?#]+?)
50+ (?:\.git
51+ |
52+ /tree/(?P<tree_branch>[^/\s?#]+)
53+ )?/?
54+ )
55+ |
56+ # Option 2: SSH
57+ (?:
58+ (?P<ssh_schema>git@)
59+ (?P<ssh_host>[^:]+):(?P<ssh_org>[^/]+)/(?P<ssh_repo>[^/@\s?]+?)(?:\.git)?/?
60+ )
61+ |
62+ # Option 3: Abbreviated protocol-style URL (e.g., gh://, gl://) + optional @branch
63+ (?:
64+ (?:(?P<gh_schema>{ _SERVICES } )://)?
65+ (?P<gh_org>[^/]+)/(?P<gh_repo>[^/@\s?]+?)(?:/|\.git)?
66+ )
67+ ) # End of URL formats group
68+ (?:@(?P<branch>[^@\s]+))? # Consolidated optional @branch suffix
69+ $ # End of string
70+ """ ,
71+ re .VERBOSE | re .IGNORECASE ,
4272 )
73+ """The regex to match for the different URL notations."""
74+
# Per-service templates for a browsable URL of a branch.  Placeholders:
# {owner}, {repo} and {branch}.  Hosts must agree with _MAP_SERVICE2URL.
_TREE_PATTERN: ClassVar[dict[str, str]] = {
    # Bitbucket Cloud lives on bitbucket.org and browses a branch below /src/.
    "bb": "https://bitbucket.org/{owner}/{repo}/src/{branch}",
    "cb": "https://codeberg.org/{owner}/{repo}/src/branch/{branch}",
    "gh": "https://github.com/{owner}/{repo}/tree/{branch}",
    "ghe": "https://github.enterprise.com/{owner}/{repo}/tree/{branch}",
    "git@": "https://github.com/{owner}/{repo}/tree/{branch}",
    "gl": "https://gitlab.com/{owner}/{repo}/-/tree/{branch}",
    "gls": "https://gitlab.suse.de/{owner}/{repo}/-/tree/{branch}",
    "gt": "https://gitea.com/{owner}/{repo}/src/branch/{branch}",
}
"""URL template for constructing tree URLs based on the service."""
4386
44- _SCHEMA_PATTERN : ClassVar [ re . Pattern ] = re . compile ( r"(?P<schema>https?):// " )
87+ _default_branches = ( "main" , "master " )
4588
4689 url : str = field (repr = False )
4790 """The full URL of the repository."""
4891
92+ treeurl : str = field (init = False , repr = False )
93+ """The full URL including the branch of the repository."""
94+
4995 surl : str
5096 """The shortened URL version of the repository, for example gh://org/repo for
5197 a GitHub repo."""
5298
5399 name : str = field (init = False , repr = False )
54100 """The abbreviated name of the repository (e.g., 'org/repo')."""
55101
56- def __init__ (self , value : str ) -> None :
102+ branch : str | None = field (init = False , repr = False )
103+ """The branch of the repository"""
104+
105+ origin : str = field (init = False , repr = False )
106+ """The original unchanged URL of the repository."""
107+
108+ def __init__ (self , value : str , default_branch : str | None = None ) -> None :
57109 """Initialize a repository model from a URL or a short name.
58110
111+ :param default_branch: The default branch to use if no branch is specified in the URL.
112+
59113 This initializer understands:
60114
61- * A full URL like ``https://host/org/repo.git``.
62- * A SSH URL like ``git@host:org/repo.git``.
63- * An abbreviated URL like ``gh://org/repo`` for a GitHub URL.
115+ * A full URL like ``https://HOST/ORG/REPO.git`` or a URL pointing
116+ to a branch like ``https://HOST/ORG/REPO/tree/BRANCH``
117+
118+ * A SSH URL like ``git@HOST:ORG/REPO.git``.
119+
120+ * An abbreviated URL like ``SERVICE://ORG/REPO`` or ``SERVICE://ORG/REPO.git``
64121 The service part (before '://') is a two to four letter code:
65- - ``gh`` for GitHub
66- - ``gl`` for GitLab
67- - ``bb`` for BitBucket
68- - ``gt`` for Gitea
69- - ``cb`` for Codeberg
70- - ``ghe`` for GitHub Enterprise
71- * An abbreviated name like ``org/repo`` which defaults to GitHub.
122+ - ``gh`` for GitHub (default)
123+ - ``gl`` for GitLab
124+ - ``bb`` for BitBucket
125+ - ``gt`` for Gitea
126+ - ``cb`` for Codeberg
127+ - ``ghe`` for GitHub Enterprise
128+ This makes the reference to a Git repo more readable.
129+
130+ * A plain notation like ``ORG/REPO`` which defaults to GitHub.
131+
132+ Branches other than default branches (main or master) are added
133+ by ``@BRANCH_NAME`` to the URL.
72134 """
73135 if not value :
74136 raise ValueError ("Repository value cannot be empty." )
75137
76- url : str
77- name : str
78-
79- service_match = self ._SERVICE_PATTERN .match (value )
80- ssh_match = self ._SSH_PATTERN .match (value )
81-
82- if "https://" in value or "http://" in value :
83- parsed_original = urlparse (value .lower ())
84- name = parsed_original .path .strip ("/" ).rsplit (".git" , 1 )[0 ]
85- url = f"{ parsed_original .scheme } ://{ parsed_original .netloc } /{ name } .git"
86- host = f"{ parsed_original .scheme } ://{ parsed_original .netloc } "
87- surl = f"{ self ._MAP_URL2SERVICE .get (host , 'gh' )} ://{ name } "
88-
89- elif service_match :
90- service = service_match .group ("abbr" ).lower ()
91- name = service_match .group ("repo" ).lower ().rstrip ("/" )
92- host = self ._MAP_SERVICE2URL .get (service )
93- if not host :
94- raise ValueError (f"Unknown repo abbreviation: '{ service } '" )
95- url = f"{ host } /{ name } .git"
96- surl = f"{ service } ://{ name } "
97-
98- elif ssh_match :
99- host = ssh_match ["host" ].lower ()
100- name = ssh_match ["repo" ].lower ()
101- name = name .rstrip ("/" )
102- url = f"https://{ host } /{ name } .git"
103- surl = f"{ self ._MAP_URL2SERVICE .get (host , 'gh' )} ://{ name } "
104-
105- elif "/" in value :
106- value = value .lower ()
107- name = value .rsplit (".git" , 1 )[0 ].rstrip ("/" )
108- url = f"{ self .DEFAULT_HOST } /{ name } .git"
109- surl = f"gh://{ name } "
110-
111- else :
138+ # Store the original string
139+ object .__setattr__ (self , "origin" , value )
140+
141+ data = self ._consolidate_match (value .lower ())
142+
143+ # Consolidate data from regex match
144+ name = f"{ data ['org' ]} /{ data ['repo' ]} "
145+ branch = data .get ("branch" )
146+ host = data .get ("host" )
147+ schema = data .get ("schema" )
148+
149+ match schema :
150+ case "http" | "https" :
151+ # For https, a host from regex does not include the schema
152+ service = self ._MAP_URL2SERVICE .get (f"{ schema } ://{ host } " , "gh" )
153+ url = f"{ schema } ://{ host } /{ name } .git"
154+ case "git@" :
155+ # For ssh, map to service and get canonical URL
156+ service = self ._MAP_URL2SERVICE .get (f"https://{ host } " , "gh" )
157+ host = self ._MAP_SERVICE2URL .get (service , self .DEFAULT_HOST )
158+ url = f"{ host } /{ name } .git"
159+ case _:
160+ # For abbreviations (gh://) or bare (org/repo)
161+ service = schema or "gh"
162+ host = self ._MAP_SERVICE2URL .get (service , self .DEFAULT_HOST )
163+ url = f"{ host } /{ name } .git"
164+
165+ # Build URLs
166+ surl = f"{ service } ://{ name } "
167+ if branch :
168+ surl += f"@{ branch } "
169+
170+ # Create the effecive branch for tree URL: prioritize found branch (either
171+ # from /tree/ or @branch), then the given default branch, then defined defaults.
172+ effective_branch = branch or default_branch or self ._default_branches [0 ]
173+ treeurl_template = self ._TREE_PATTERN .get (service , self ._TREE_PATTERN ["gh" ])
174+ treeurl = treeurl_template .format (
175+ owner = data ["org" ], repo = data ["repo" ], branch = effective_branch
176+ )
177+
178+ # Use object.__setattr__ because the dataclass is frozen
179+ object .__setattr__ (self , "url" , url )
180+ object .__setattr__ (self , "treeurl" , treeurl )
181+ object .__setattr__ (self , "name" , name )
182+ object .__setattr__ (self , "surl" , surl )
183+ object .__setattr__ (self , "branch" , branch )
184+
185+ def _consolidate_match (self , value : str ) -> dict :
186+ """Consolidate keys for a cleaner API."""
187+ match = self ._REPOS_PATTERN .match (value )
188+ if not match :
112189 raise ValueError (
113190 f"Invalid repository value: '{ value } '. "
114191 "Expected a full HTTPS URL, SSH URL, abbr notation, "
115192 "or an abbreviated name."
116193 )
194+ raw_data = match .groupdict ()
195+ result = {
196+ "schema" : raw_data .get ("https_schema" )
197+ or raw_data .get ("ssh_schema" )
198+ or raw_data .get ("gh_schema" ),
199+ "host" : raw_data .get ("https_host" ) or raw_data .get ("ssh_host" ),
200+ "org" : raw_data .get ("https_org" )
201+ or raw_data .get ("ssh_org" )
202+ or raw_data .get ("gh_org" ),
203+ "repo" : raw_data .get ("https_repo" )
204+ or raw_data .get ("ssh_repo" )
205+ or raw_data .get ("gh_repo" ),
206+ }
117207
118- # Use object.__setattr__ because the dataclass is frozen
119- object .__setattr__ (self , "url" , url )
120- object .__setattr__ (self , "name" , name )
121- object .__setattr__ (self , "surl" , surl )
208+ # Branch Logic: Prioritize the /tree/ branch, fallback to @branch
209+ branch = raw_data .get ("tree_branch" ) or raw_data .get ("branch" )
210+ result ["branch" ] = branch
211+
212+ return result
122213
123214 def __eq__ (self , other : object ) -> bool :
124215 """Compare Repo with another Repo (by name) or a string (by name)."""
@@ -148,3 +239,24 @@ def slug(self) -> str:
148239 return self .url .translate (
149240 str .maketrans ({":" : "_" , "/" : "_" , "-" : "_" , "." : "_" }),
150241 )
242+
243+
if __name__ == "__main__":
    # Manual smoke test: print the derived attributes for one sample of
    # each supported notation.
    test_urls = [
        "https://github.com/lycheeverse/lychee/tree/relative-link-fixes",  # HTTPS with /tree/ branch
        "https://GitHub.com/opensuse/docbuild.git",  # HTTPS no branch
        "git@github.com:openSUSE/docbuild.git",  # SSH no branch
        "gh://openSUSE/docbuild",  # Abbr no branch
        "gh://openSUSE/docbuild@v1",  # Abbr with @branch
    ]

    for url in test_urls:
        repo = Repo(url)
        print(f"┌ {url}")
        print(f"├─ {repo.treeurl=}")
        print(f"├─ {repo.surl=}")
        print(f"├─ {repo.name=}")
        print(f"├─ {repo.branch=}")
        print(f"└─ {repo.url=}")
        print()
0 commit comments