from __future__ import annotations
from functools import lru_cache
import ipaddress
from typing import Dict, List, Sequence, Tuple, Type, Pattern
from urllib.parse import urlparse, ParseResult
from lchttp.uri import (
    decode,
    parse_host,
    parse_query_string,
)
from publicsuffixlist import PublicSuffixList
from .cache import (
    cached_property,
)
from .mimes import PATH_MIMES, HOSTNAME_MIMES, URL_MIMES
from .mimetypes import Extension

from .settings import STOPWORDS, re_icompile
from .typehints import URLPath, UrlDetails

# Hyphens and underscores; clean_string() substitutes a space for each match.
# re_icompile comes from .settings — presumably a case-insensitive
# re.compile wrapper; confirm there.
REPLACE = re_icompile(r'_|-')

# Coarse URL sanity check used by Extractor.getExtractor():
# scheme (ftp / http / https), optional user[:password]@, a non-whitespace
# run, then an optional port and optional leading path character.
VALID_URL = re_icompile(
    r'(ftp|https?)://(\w+:{0,1}\w*@)?(\S+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?'
)

# Match all-numeric search terms or mostly numeric terms
# that start / end with alphabetic characters.
# IOW, part number type searches aren't interesting.
PART_NUMBERS = re_icompile(r'^[a-z]?[0-9 !\"#$%&\'()*+,-./:;<=>?@\[\]^_{|}~]*[a-z]?$')

# Vowels (including "y"); clean_string() discards terms with no match.
VOWELS = re_icompile(r'[aeiouy]')

# Single shared PublicSuffixList instance, created once at import time
# and queried by get_domain_name().
public_suffix = PublicSuffixList()


@lru_cache(maxsize=512)
def url_parse(url):
    """
    Split ``url`` into its components with ``urllib.parse.urlparse``.

    Results are memoised (up to 512 distinct URLs), so repeated
    calls with the same URL return the same ParseResult object.
    """
    parsed = urlparse(url)
    return parsed


# Leading hostname labels that carry no reporting value.
# normalize_hostname() drops the first label of any hostname that
# starts with one of these, so every entry must be a complete label
# terminated by a dot (it is used with str.startswith).
trim_prefixes = (
    'analytics.',
    'app.',
    'apps.',
    'asset.',
    'assets.',
    'login.',
    'signin.',
    'signup.',
    'click.',
    'log.',
    'www.',
    'g.',
    'i.',
    'm.',
    'mi.',
    'dl.',
    'cdn.',
    'img.',
    'images.',
    'fonts.',
    'ocsp.',
    'crl.',
    'ssl.',
    'pki.',
    'pub.',
    'ws.',
    'oauth.',
    'oauth2.',
    'css.',
    'js.',
    'js-cdn.',
    'link.',
    'links.',
    # was 'media.com', which only matched hosts beginning "media.com..."
    # and so never trimmed e.g. media.example.com
    'media.',
    'static.',
    'static-cdn.',
    'staticcdn.',
    'stats.',
    'status.',
    'sdk.',
    'tag.',
    'tags.',
    'metrics.',
    'telemetry.',
    'track.',
    'tracker.',
    'tracking.',
    'widget.',
    'widgets.',
)

# Domain suffixes (each with a leading dot) that gate normalize_hostname():
# a hostname that does not end with one of these is returned unchanged
# once the trim_prefixes check has been applied. Used with str.endswith,
# so the leading dot restricts matches to sub-domains of each entry.
normalize_domains = (
    '.apple.com',
    '.microsoft.com',
    '.amazon-adsystem.com',
    '.ggpht.com',
    '.gvt1.com',
    '.goo.gl',
    '.google.com',
    '.googleapis.com',
    '.googlevideo.com',
    '.googleusercontent.com',
    '.virtualearth.net',
    '.dropbox.com',
    '.live.com',
    '.mapbox.com',
    '.windows.com',
    '.windows.net',
    '.gstatic.com',
    '.icloud.com',
    '.icloud-content.com',
    '.clearsdn.com',
    '.cloudfront.net',
    '.mcafee.com',
    '.weatherbug.net',
    '.whatsapp.net',
    '.akstat.io',
    '.windowsupdate.com',
    '.intuit.com',
    '.quickbooks.com',
    '.doubleclick.net',
    '.optimizely.com',
    '.screenconnect.com',
    '.ebaystatic.com',
    '.xboxlive.com',
    '.online-metrix.net',
    '.akamaihd.net',
    '.firebaseio.com',
)

# Suffixes that normalize_hostname() returns verbatim: once a hostname has
# passed the normalize_domains gate, the first entry here (in tuple order)
# that the hostname ends with becomes the normalized value.
#
# Matching is a plain str.endswith with no label-boundary check, so an
# entry such as 'ws.microsoft.com' also matches 'news.microsoft.com'.
# NOTE(review): confirm whether that is intended for every entry.
#
# NOTE(review): 'weather.com' has no '.weather.com' counterpart in
# normalize_domains, so it appears to be unreachable.
retain_suffixes = (
    'amazon-adsystem.com',
    'phobos.apple.com',
    'smoot.apple.com',
    'ssl.ls.apple.com',
    'buy.itunes.apple.com',
    'smp-device.apple.com',
    'mp.microsoft.com',
    'telemetry.microsoft.com',
    'events.data.microsoft.com',
    'ws.microsoft.com',
    'update.microsoft.com',
    'googlevideo.com',
    'app.goo.gl',
    'gvt1.com',
    'ggpht.com',
    'googleusercontent.com',
    'performance.dropbox.com',
    'activity.windows.com',
    'metric.gstatic.com',
    'icloud-content.com',
    'cloudfront.net',
    'mcafee.com',
    'akstat.io',
    'firebaseio.com',
    's.akamaihd.net',
    'cdn.optimizely.com',
    'doubleclick.net',
    'clearsdn.com',
    'fna.whatsapp.net',
    'cdn.whatsapp.net',
    'quickbooks.com',
    'qbo.intuit.com',
    'a.intuit.com',
    'screenconnect.com',
    'aa.online-metrix.net',
    'api.weatherbug.net',
    'weather.com',
    'pulse.weatherbug.net',
    'tiles.mapbox.com',
    'tiles.virtualearth.net',
    'dev.virtualearth.net',
    'storage.live.com',
    'officeapps.live.com',
    'ssl.xboxlive.com',
    'blob.core.windows.net',
    'servicebus.windows.net',
    'download.windowsupdate.com',
)


@lru_cache(maxsize=None)
def _hostname_rewrites():
    """
    Build the ordered (replacement, compiled pattern) table
    used by normalize_hostname().

    Cached so the patterns are compiled on first use only, rather than
    on every normalize_hostname() cache miss. Dots that stand for a
    literal label separator are escaped so they can't match other
    characters.
    """
    return tuple(
        (replacement, re_icompile(pattern))
        for replacement, pattern in (

            # api20.apple.com
            (r'\g<1>\g<2>', r'(api)\d*(\.\w+\.com)'),

            # 22-courier.push.apple.com
            # 23-courier2.push.apple.com
            (r'\g<1>\g<2>', r'\d+-(courier)\d*(\.push\.apple\.com)'),

            # ld-8.itunes.apple.com
            (r'\g<1>', r'[a-z]{1,2}-\d+\.(itunes\.apple\.com)'),

            # cl2.apple.com
            # cl5.apple.com
            (r'\g<1>', r'[a-z]{1,2}\d+\.(apple\.com)'),

            # firebaselogging.googleapis.com
            # firebasedynamiclinks-ipv4.googleapis.com
            (r'\g<1>\g<2>', r'(firebase).*(\.googleapis\.com)'),

            # khms0.googleapis.com
            # khms1.googleapis.com
            (r'\g<1>\g<2>', r'(khms)\d+(\.googleapis\.com)'),

            # clients1.google.com
            # lh5.google.com
            # jmt17.google.com
            (r'\g<1>\g<2>', r'(clients|lh|jmt)\d+(\.google\.com)'),

            # api-content.dropbox.com
            # api-d.dropbox.com
            (r'\g<1>\g<2>', r'(api)-.*(\.dropbox\.com)'),

            # encrypted-tbn1.gstatic.com
            # encrypted-vbtn3.gstatic.com
            (r'\g<1>\g<2>', r'(encrypted)-.*(\.gstatic\.com)'),

            # t0.gstatic.com
            # t1.gstatic.com
            (r'\g<1>', r'[a-z]\d+\.(gstatic\.com)'),

            # p55-fmf.icloud.com
            # p51-caldav.icloud.com
            (r'\g<1>\g<2>', r'[a-z]\d+-(\w+).*(\.icloud\.com)'),

            # thumbs1.ebaystatic.com
            (r'\g<1>\g<2>', r'(thumbs)\d.*(\.ebaystatic\.com)'),
        )
    )


@lru_cache(maxsize=1024)
def normalize_hostname(name: str) -> str:
    """
    Normalize hostname to reduce unique entries,
    especially for DNS resolver logs.

    Entries such as:
        a1030.phobos.apple.com
        a1153.phobos.apple.com
        a1257.phobos.apple.com
    can be squashed to:
        phobos.apple.com
    for more compact reporting.

    Steps, first hit wins:
        1. Hostnames longer than 6 characters that start with one of
           ``trim_prefixes`` lose their first label and are returned
           immediately (no further normalization).
        2. Hostnames not ending with one of ``normalize_domains``
           are returned unchanged.
        3. The first ``retain_suffixes`` entry the hostname ends with
           is returned as-is.
        4. The first rewrite pattern that changes the hostname
           provides the result.
    Otherwise the hostname is returned unchanged.

    NOTE(review): the patterns are applied with ``sub`` and are not
    anchored, so text preceding the matched portion is preserved.
    """

    if len(name) > 6 and name.startswith(trim_prefixes):
        dot = name.index('.') + 1
        return name[dot:]

    if not name.endswith(normalize_domains):
        return name

    # if we only want the suffix, the first matching entry wins
    for suffix in retain_suffixes:
        if name.endswith(suffix):
            return suffix

    for replacement, pattern in _hostname_rewrites():
        rewritten = pattern.sub(replacement, name)
        if rewritten != name:
            return rewritten

    return name


def clean_string(name: str) -> str:
    """
    Tidy up a search term / title candidate.

    A tuple or list is joined with spaces first. The stripped value
    is rejected (empty string returned) when it is:
        * empty, or shorter than 3 characters
        * a stop word
        * a part-number style term
        * free of vowels

    Anything that survives is URL-decoded and has its
    hyphens and underscores replaced by spaces.
    """
    if not name:
        return ''

    text = ' '.join(name) if isinstance(name, (tuple, list)) else name
    text = text.strip()

    # cheap rejections first; each check runs only if the previous passed
    if len(text) < 3 or text in STOPWORDS:
        return ''

    if PART_NUMBERS.match(text) or not VOWELS.search(text):
        return ''

    decoded = decode(text, unquote_plus=True)
    return REPLACE.sub(' ', decoded)


def verify_schema(url: str) -> str:
    """
    Make sure URL string contains schema

    * URLs that already start with ``http:``, ``https:`` or ``ftp://``
      (in any letter case) are returned untouched.
    * Protocol-relative URLs (``//host/path``) get ``http:`` prepended.
    * Anything else is treated as ``host[/path]`` and gets ``http://``.
    """
    # Schemes are case-insensitive, and VALID_URL accepts ftp as well,
    # so neither may be mistaken for a bare hostname.
    # 'ftp://' (not 'ftp:') so a "ftp:21" host:port pair still gets a scheme.
    if url[:6].lower().startswith(('http:', 'https:', 'ftp://')):
        return url

    if url.startswith('//'):
        return f'http:{url}'

    return f'http://{url}'


@lru_cache(maxsize=256)
def get_domain_name(hostname: str) -> Tuple[str, str]:
    """
    Return domain name or IP address from hostname or URL string

    Returns a ``(hostname, domain_name)`` pair:
        * for an IP address both items are the address itself
        * otherwise ``hostname`` has a leading ``www.`` removed and
          ``domain_name`` is whatever ``public_suffix.privatesuffix``
          returns, or ``''`` if that lookup raised.

    NOTE(review): ``privatesuffix`` may return None when no registrable
    domain can be determined (Extractor.getExtractor guards against a
    non-string value) — confirm against the publicsuffixlist docs.
    """
    try:
        # presumably separates the host from any port; verify in lchttp.uri
        hostname, _ = parse_host(hostname)
    except ValueError:
        # keep the value as supplied
        pass

    # skip the IP address check for names starting with "www"
    if not hostname.startswith('www'):
        try:
            ipaddress.ip_address(hostname)
        except ValueError:
            pass
        else:
            return hostname, hostname

    try:
        domain_name = public_suffix.privatesuffix(hostname)
    except Exception:
        # Best effort: report and carry on with an empty domain name, but
        # don't swallow KeyboardInterrupt / SystemExit like a bare except.
        print(f'Domain name could not be parsed from {hostname}')
        domain_name = ''

    if hostname.startswith('www.'):
        return hostname[4:], domain_name

    return hostname, domain_name


def get_domain_path_keys(domain: str, path: str) -> List[str]:
    """
    Build ``domain/path`` lookup keys, from the longest path to the shortest.

    Empty path segments are ignored, and a final segment containing a
    dot is treated as a filename and left out.

    domain = 'google.com'
    path = '/maps/place'

    return [
        'google.com/maps/place',
        'google.com/maps',
    ]
    """
    segments = [part for part in path.split('/') if part]

    # a trailing "name.ext" segment is a file, not a directory
    if segments and '.' in segments[-1]:
        segments.pop()

    return [
        f"{domain}/{'/'.join(segments[:depth])}"
        for depth in range(len(segments), 0, -1)
    ]


class Extractor:
    """
    Pull details of interest out of a URL: hostname, domain name,
    query params, search term, page title, path extension and the
    most likely mimetype.

    Site-specific behaviour is supplied by subclasses, which set
    ``domain_name`` plus the key / pattern attributes below. Each
    subclass is registered in ``Subclasses`` when it is defined, and
    ``getExtractor`` returns an instance of the best match for a URL.
    """

    # Registry of subclasses, keyed by each subclass's domain_name
    Subclasses: Dict[str, Type[Extractor]] = {}

    # For this extractor to be selected for parsing a URL,
    # hostname.domain_name.tld, domain_name.tld, domain_name
    # or domain + path must match this string
    domain_name: str = ''

    # Exclude all URLs matching these patterns as not being of interest.
    # Such URLs might include query keys that aren't user generated, for example
    exclude_patterns: Tuple[Pattern, ...] = ()

    # Query Param key that is most likely to
    # represent a user-entered search term
    search_keys: Tuple[str, ...] = ()

    # regex patterns will _only_ match against the path!
    search_patterns: Tuple[URLPath, ...] = ()

    # regex patterns will _only_ match against the fragment!
    fragment_patterns: Tuple[URLPath, ...] = ()

    # Some URLs have the page title embedded in the URL
    # regex patterns will _only_ match against the path!
    title_patterns: Tuple[URLPath, ...] = ()
    min_title_length = 0

    # Viewing media generates many URLs which contain the Media ID but
    # aren't the direct link to the video. Set the base URL here so the
    # direct link can be derived from any related URL for user convenience.
    base_media_download_url = ''

    def __init__(self, url: str, hostname: str, domain_name: str, parse_url: ParseResult) -> None:
        """
        Store the pre-parsed URL parts. Instances are normally
        created through ``getExtractor`` rather than directly.
        """
        self.url = url
        self.parse_url = parse_url
        # instance value shadows the class-level selector string
        self.domain_name = domain_name
        self.hostname = hostname

    def __init_subclass__(cls: Type[Extractor], **kwargs: dict) -> None:
        """
        Include all child subclasses, including subclasses of subclasses
        to make an easy lookup dict, to retrieve the correct Extractor
        for the supplied URL.
        """
        cls.Subclasses[cls.domain_name] = cls

    @classmethod
    def getExtractor(cls, url: str):
        """
        Return the correct Extractor class for the URL.

        Lookup keys are tried from most to least specific:
        domain + path (longest path first), then hostname, hostname
        without the TLD, domain name, and domain name without the TLD.
        Falls back to a plain ``Extractor`` when nothing is registered.

        Raises ValueError if the URL doesn't match ``VALID_URL``.
        """
        url = verify_schema(url.strip())
        if VALID_URL.match(url) is None:
            raise ValueError(f'Not a valid URL: {url!r}')

        parse_url = url_parse(url)

        hostname, domain_tld = get_domain_name(hostname=parse_url.netloc)
        try:
            domain = domain_tld.split('.')[0]
        except AttributeError:
            # no usable (non-string) domain name; fall back to the hostname
            domain = domain_tld = hostname

        domain_keys = get_domain_path_keys(domain_tld, parse_url.path.lower())

        if hostname != domain_tld:
            tld = '.'.join(domain_tld.split('.')[1:])
            # Without a TLD there is nothing to strip; slicing with
            # [:-1] would just chop the last character off the hostname.
            host_domain = hostname[:-len(tld) - 1] if tld else hostname
            domain_keys.extend([
                hostname,  # login.ebay.com
                host_domain,  # login.ebay
                domain_tld,  # ebay.com
                domain,  # ebay
            ])
        else:
            domain_keys.extend([
                hostname,  # ebay.com
                domain,  # ebay
            ])

        for key in domain_keys:
            extractor = Extractor.Subclasses.get(key, None)
            if extractor:
                return extractor(
                    url=url,
                    hostname=hostname,
                    domain_name=domain_tld,
                    parse_url=parse_url,
                )

        # return base class for basic hostname/domain_name functions
        return Extractor(
            url=url,
            hostname=hostname,
            domain_name=domain_tld,
            parse_url=parse_url,
        )

    def __str__(self) -> str:
        return self.url

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.url})'

    @cached_property
    def normalized_hostname(self) -> str:
        """
        Network location squashed by ``normalize_hostname``.

        NOTE(review): this passes the raw ``netloc`` (which can carry
        userinfo / port) rather than ``self.hostname`` — confirm intent.
        """
        return normalize_hostname(self.parse_url.netloc)

    @cached_property
    def normalize_domain_name(self) -> str:
        """
        Domain name squashed by ``normalize_hostname``.
        """
        return normalize_hostname(self.domain_name)

    @cached_property
    def exclude_url(self) -> bool:
        """
        If URL matches an exclude regex, it's not of interest!
        """
        for pattern in self.exclude_patterns:
            if pattern.search(self.url):
                return True
        return False

    @cached_property
    def query_params(self) -> dict:
        """
        Get query params from URL Query params or Fragment, preferring fragment

        Returns an empty dict for excluded URLs.
        """
        if self.exclude_url:
            return {}

        qp = parse_query_string(self.parse_url.query, csv=False)
        if self.parse_url.fragment:
            # fragment values overwrite query values with the same key
            qp.update(parse_query_string(self.parse_url.fragment, csv=False))

        return qp

    @cached_property
    def search_term_from_query_params(self) -> str:
        """
        Check query params for user-generated search term

        The value of the first ``search_keys`` entry with a
        non-empty value is returned, after ``clean_string``.
        """
        for key in self.search_keys:
            qs = self.query_params.get(key, '')
            if qs:
                return clean_string(qs)
        return ''

    def get_string(self, patterns: Sequence[URLPath], section: str) -> str:
        """
        Get user-generated search term or other
        string of interest from URL path structure

        ``patterns`` is a sequence of ``(group, compiled pattern)`` pairs;
        the named / numbered ``group`` of the first pattern found in
        ``section`` is returned. Returns '' for excluded URLs or when
        nothing matches.
        """
        if self.exclude_url:
            return ''

        for group, pattern in patterns:
            match = pattern.search(section)
            if match:
                return match.group(group)

        return ''

    @cached_property
    def search_term_from_path(self) -> str:
        """
        Get user-generated search term from URL path structure
        """
        return self.get_string(patterns=self.search_patterns, section=self.parse_url.path)

    @cached_property
    def search_term_from_fragment(self) -> str:
        """
        Get user-generated search term or other
        string of interest from URL Fragment structure
        """
        return self.get_string(patterns=self.fragment_patterns, section=self.parse_url.fragment)

    @cached_property
    def search_term(self) -> str:
        """
        Get user-generated search term, removing URL encodings, hyphens and underscores

        Sources are tried in order: query params, path, fragment.
        """
        if self.exclude_url:
            return ''

        for term_extractor in (
                'search_term_from_query_params',
                'search_term_from_path',
                'search_term_from_fragment',
        ):
            # the query param source has already been through
            # clean_string; it is applied again here along with the rest
            term = clean_string(getattr(self, term_extractor))
            if term:
                return term

        return ''

    @cached_property
    def title(self) -> str:
        """
        Get Page title from URL path

        Titles shorter than ``min_title_length`` (after cleaning)
        are discarded.
        """
        title = self.get_string(patterns=self.title_patterns, section=self.parse_url.path)
        clean_title = clean_string(title)
        if len(clean_title) >= self.min_title_length:
            return clean_title
        return ''

    @cached_property
    def path_extension(self) -> str:
        """
        Get path extension, handling malformed query params

        Returns the text after the last dot of the final path segment,
        or '' when the path has no slash or the segment has no dot.
        """
        path = self.parse_url.path
        last_slash = path.rfind('/') + 1
        if not last_slash:
            return ''

        filename = path[last_slash:]
        first_dot = filename.find('.')
        if first_dot == -1:
            return ''

        # Handle malformed query params
        # sdk.js&version=v2.0, but not Grab&Go-icon.png
        #
        # An ampersand that follows a dot starts a query string that was
        # glued to the filename. Cut it off _before_ looking for the last
        # dot, otherwise a dot inside the params ("v2.0") is mistaken for
        # the extension separator.
        amperindex = filename.find('&', first_dot)
        if amperindex != -1:
            filename = filename[:amperindex]

        return filename[filename.rfind('.') + 1:]

    @cached_property
    def default_mimetype(self) -> str:
        """
        Most likely mimetype, based on hostname or path patterns.
        Useful when request is blocked, and logline doesn't define mimetype.
        """
        # Check extension mime first, but if it's HTML check Path / Hostname
        # check Path and Host for patterns
        extension_mime = ''
        path_extension = self.path_extension
        if path_extension:
            extension_mime = Extension(path_extension).mimetype()
            if extension_mime and extension_mime != 'text/html':
                return extension_mime

        # paths first because they're more specific than host patterns
        path = self.parse_url.path
        for pattern, mimetype in PATH_MIMES:
            if pattern.search(path):
                return mimetype

        hostname = self.hostname
        for pattern, mimetype in HOSTNAME_MIMES:
            if pattern.search(hostname):
                return mimetype

        for pattern, mimetype in URL_MIMES:
            if pattern.search(self.url):
                return mimetype

        # nothing more specific found; fall back to the
        # extension's mimetype ('text/html' or empty)
        return extension_mime

    def is_tech_service(self) -> bool:
        """
        Request type pertains to background technical services
        such as scripting, encryption checks, etc. Probably not
        directly user-generated.

        Checked when Redwood wasn't able to determine a classifier score.
        """
        return self.hostname.startswith(('login.', 'api.', 'apis.', 'metric.', 'metrics.'))

    def media_id(self) -> str:
        """
        Extract the Media ID from the URL.

        The base implementation returns '';
        media-aware subclasses are expected to override it.
        """
        return ''

    def playing_media_id(self) -> str:
        """
        If this is a URL that indicates that a video
        is playing or paused, return the video id.
        """
        return ''

    def media_download_url(self) -> str:
        """
        Many media services generate URLs that contain the Media ID,
        but aren't the actual _download_ URL. So extract the Media ID
        and generate the URL so that the URL "just works" when users
        try to classify a video.

        ``base_media_download_url`` is formatted with ``media_id``;
        '' is returned when there is no Media ID.
        """
        if media_id := self.media_id():
            return self.base_media_download_url.format(media_id=media_id)
        return ''

    def all(self) -> UrlDetails:
        """
        Display all details that can be extracted from the URL
        """
        return UrlDetails(
            default_mimetype=self.default_mimetype,
            domain_name=self.domain_name,
            extension=self.path_extension,
            hostname=self.hostname,
            params=self.query_params,
            search_term=self.search_term,
            title=self.title,
        )
