102 lines
3.1 KiB
Python
102 lines
3.1 KiB
Python
|
"""Domain."""
|
||
|
|
||
|
# standard
|
||
|
from os import environ
|
||
|
from pathlib import Path
|
||
|
import re
|
||
|
from typing import Optional, Set
|
||
|
|
||
|
# local
|
||
|
from .utils import validator
|
||
|
|
||
|
|
||
|
class _IanaTLD:
    """Look up top-level domains in the bundled `_tld.txt` list.

    A small built-in set answers the most common queries without touching
    the file. Every other query reads `_tld.txt` (located next to this
    module); the parsed list is kept in memory only when the environment
    variable `PYVLD_CACHE_TLD` equals `"True"`.
    """

    # Complete TLD set; stays `None` until caching is enabled and first used.
    _full_cache: Optional[Set[str]] = None
    # source: https://www.statista.com/statistics/265677 (plus "ONION")
    _popular_cache = {
        "COM",
        "ORG",
        "RU",
        "DE",
        "NET",
        "BR",
        "UK",
        "JP",
        "FR",
        "IT",
        "ONION",
    }

    @classmethod
    def _retrieve(cls):
        """Yield each entry of `_tld.txt`, stripped, skipping the first line."""
        tld_path = Path(__file__).parent / "_tld.txt"
        with tld_path.open() as handle:
            next(handle)  # the first line is not an entry
            yield from map(str.strip, handle)

    @classmethod
    def check(cls, tld: str):
        """Return whether `tld` is a known top-level domain.

        The comparison is exact, so `tld` must already use the same letter
        case as the stored entries (the built-in set is upper-case).
        """
        if tld in cls._popular_cache:
            return True

        # A previously built cache answers directly.
        if cls._full_cache is not None:
            return tld in cls._full_cache

        # Caching disabled: scan the file lazily and keep nothing in memory.
        if environ.get("PYVLD_CACHE_TLD") != "True":
            return tld in cls._retrieve()

        cls._full_cache = set(cls._retrieve())
        return tld in cls._full_cache
|
||
|
|
||
|
|
||
|
@validator
def domain(
    value: str, /, *, consider_tld: bool = False, rfc_1034: bool = False, rfc_2782: bool = False
):
    """Return whether or not given value is a valid domain.

    Examples:
        >>> domain('example.com')
        True
        >>> domain('example.com/')
        ValidationError(func=domain, args={'value': 'example.com/'})
        >>> # Supports IDN domains as well::
        >>> domain('xn----gtbspbbmkef.xn--p1ai')
        True

    Args:
        value:
            Domain string to validate.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
        rfc_1034:
            Allows optional trailing dot in the domain name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain name is of type service record.
            Allows optional underscores in the domain name.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).


    Returns:
        (Literal[True]): If `value` is a valid domain name.
        (ValidationError): If `value` is an invalid domain name.

    Raises:
        (UnicodeError): If `value` cannot be encoded into `idna` or decoded into `utf-8`.
    """
    if not value:
        return False

    # The TLD is the last dot-separated label, ignoring any trailing dots;
    # it is upper-cased before the lookup.
    if consider_tld and not _IanaTLD.check(value.rstrip(".").rsplit(".", 1)[-1].upper()):
        return False

    # Whitespace and runs of two or more underscores are rejected before
    # any encoding is attempted.
    if re.search(r"\s|__+", value):
        return False

    # Only the IDNA round-trip is guarded: it is the step the error message
    # below describes.
    try:
        ascii_value = value.encode("idna").decode("utf-8")
    except UnicodeError as err:
        raise UnicodeError(f"Unable to encode/decode {value}") from err

    service_record = r"_" if rfc_2782 else ""
    trailing_dot = r"\.?$" if rfc_1034 else r"$"

    return re.match(
        # First character of the domain
        rf"^(?:[a-z0-9{service_record}]"
        # Sub-domain
        + rf"(?:[a-z0-9-{service_record}]{{0,61}}"
        # Hostname
        + rf"[a-z0-9{service_record}])?\.)"
        # First 61 characters of the gTLD; underscores are accepted here
        # only under `rfc_2782`, matching the other labels and the docstring
        + rf"+[a-z0-9][a-z0-9-{service_record}]{{0,61}}"
        # Last character of the gTLD
        + rf"[a-z]{trailing_dot}",
        ascii_value,
        re.IGNORECASE,
    )
|