from selectolax.parser import HTMLParser as SelectoHTMLParser
from .settings import STOP_WORDS
from .domains import extract_domain_name


class HTMLParser:
    """
    Convenience wrapper around a selectolax document that extracts
    common page details: title, meta keywords / description, visible
    text, links, images, asset URLs and the domains they point to.
    """

    def __init__(self, html: str) -> None:
        """
        Parse ``html`` once and keep the resulting tree for the extractors.
        """
        self.html = html
        self.soup = SelectoHTMLParser(html)
        # Filled lazily by get_metatags().
        self.metatags = []

    def get_metatags(self):
        """
        Return all <meta> nodes of the document.

        The query result is stored on first use and reused afterwards.
        An empty result is not cached, so a page without <meta> tags
        is queried again on every call.
        """
        if not self.metatags:
            self.metatags = self.soup.css('meta')
        return self.metatags

    def title(self) -> str:
        """
        Get the stripped text of the first <title> element,
        or an empty string when it cannot be read.
        """
        try:
            return self.soup.css_first('title').text().strip()
        except AttributeError:
            # Typically means css_first() found no <title> node.
            return ''

    def keywords(self) -> list:
        """
        Get all keyword and article tags.

        Collects the comma-separated entries of <meta name="keywords">
        and the content of every meta whose property ends with ':tag'.
        Returns the unique entries, sorted, keeping only those longer
        than three characters that are not stop words.
        """
        keywords = set()

        for tag in self.get_metatags():
            attributes = tag.attributes
            # Attribute values may be missing or None; normalise to ''.
            name = attributes.get('name') or ''
            prop = attributes.get('property') or ''
            content = attributes.get('content') or ''

            if name == 'keywords':
                keywords.update(part.strip() for part in content.split(','))
            elif prop.endswith(':tag'):
                keywords.add(content.strip())

        return sorted(
            kw for kw in keywords
            if len(kw) > 3 and kw.lower() not in STOP_WORDS
        )

    def description(self) -> str:
        """
        Get default description from metatag, but also check
        to see if OG / Twitter descriptions are specified.
        If so, prefer those fields, as they may be longer.

        The first non-empty ``*:description`` property wins. When none
        is found, the last <meta name="description"> content is returned
        (or an empty string if there is no such tag).
        """
        description = ''

        for tag in self.get_metatags():
            attributes = tag.attributes
            name = attributes.get('name') or ''
            prop = attributes.get('property') or ''
            content = attributes.get('content') or ''

            if name == 'description':
                description = content

            # property="twitter:description" / property="og:description"
            # An empty social description is skipped so that it cannot
            # hide a usable default description.
            if prop.endswith(':description') and content:
                return content

        return description

    def visible_text(self) -> list:
        """
        Get list of visible text snippets.

        The <body> text is split on newlines; each line is stripped and
        blank lines, purely numeric lines and stop words are dropped.
        Returns an empty list when the body text cannot be read.
        """
        try:
            snippets = self.soup.css_first('body').text().strip().split('\n')
        except Exception as e:
            # Best effort: report the failure and carry on without text.
            print(e)
            return []

        text_lines = [s.strip() for s in snippets if s.strip()]
        return [
            line for line in text_lines
            if not line.isnumeric() and line.lower() not in STOP_WORDS
        ]

    def urls(self) -> list:
        """
        Get all <a> nodes of the document.
        """
        return self.soup.css('a')

    def images(self) -> list:
        """
        Get all <img> nodes of the document.
        """
        return self.soup.css('img')

    def assets(self) -> list:
        """
        Get the URLs referenced by <link href> and <script src>,
        links first, skipping tags where the attribute is missing or empty.
        """
        link_hrefs = [node.attributes.get('href') for node in self.soup.css('link')]
        script_srcs = [node.attributes.get('src') for node in self.soup.css('script')]
        return [url for url in link_hrefs + script_srcs if url]

    def extract_unique_domains(self, urls: list) -> list:
        """
        Extract unique domains from list of URLs.

        Each URL is passed to ``extract_domain_name``; falsy results are
        ignored and the remaining domains are returned sorted.
        """
        domains = set()

        for url in urls:
            domain = extract_domain_name(url)
            if domain:
                domains.add(domain)

        return sorted(domains)

    def asset_domains(self) -> list:
        """
        Get unique list of all domains referenced by the
        <link> and <script> assets of this page.
        """
        return self.extract_unique_domains(self.assets())

    def external_domains(self) -> list:
        """
        Get unique list of all domain names that this page links to.
        """
        hrefs = [anchor.attributes.get('href') for anchor in self.urls()]
        # Anchors without an href yield None; drop them (as assets() does)
        # so the domain extractor only ever receives real strings.
        return self.extract_unique_domains([href for href in hrefs if href])


# Public API of this module.
__all__ = ['HTMLParser']
