import csv
from lcrequests import Request
from invoke import task
from pathlib import Path
import re

from dalmatian.settings import DALMATIAN_DOMAINS_DIR, REDWOOD_CATEGORY_DIR


@task
def refresh_top_domains(ctx):
    """
    Refresh top domains categories, generated from the Majestic Million service.

    Refreshes the IANA TLD list and the Majestic Million CSV, filters the
    CSV's ``Domain`` column, and writes the surviving domains (in CSV order)
    to:

    * ``{REDWOOD_CATEGORY_DIR}/top/10k``     - the first 10,000 domains
    * ``{REDWOOD_CATEGORY_DIR}/top/100k``    - domains 10,001 to 100,000
    * ``{REDWOOD_CATEGORY_DIR}/top/million`` - everything after that
    * ``DALMATIAN_DOMAINS_DIR``              - the complete list

    Every value seen in the CSV's ``TLD`` column (including rows whose domain
    was filtered out) is written, sorted, to
    ``{DALMATIAN_DOMAINS_DIR}/top_tlds.txt``.

    ``ctx`` is the invoke context; it is not used by this task.
    """
    tlds = set()
    cleaned_domains = []
    seen = set()
    all_tlds = refresh_top_level_domains()
    majestic_million = refresh_majestic_million()

    # Decode explicitly as UTF-8 rather than relying on the locale default.
    # NOTE(review): assumes the Majestic CSV is UTF-8 encoded - confirm.
    with open(majestic_million, newline='', encoding='utf-8') as mm:
        for row in csv.DictReader(mm):
            domain = row['Domain'].strip()
            tlds.add(row['TLD'].strip())
            if SKIP_DOMAINS.search(domain):
                continue
            if domain.startswith('www.'):
                domain = domain[4:]
            # Ensure that no TLDs leak into the list, since
            # they'd match _any_ domain with that TLD!
            if domain in all_tlds:
                continue
            # Stripping "www." can make two rows collapse to the same name;
            # keep only the first (earliest in the CSV) occurrence so a
            # domain is never listed twice or in two tiers.
            if domain in seen:
                continue
            seen.add(domain)
            cleaned_domains.append(domain)

    redwood_category = f'{REDWOOD_CATEGORY_DIR}/top'
    write_category_domains(f'{redwood_category}/10k', cleaned_domains[:10_000])
    write_category_domains(f'{redwood_category}/100k', cleaned_domains[10_000:100_000])
    write_category_domains(f'{redwood_category}/million', cleaned_domains[100_000:])
    write_category_domains(DALMATIAN_DOMAINS_DIR, cleaned_domains)

    tld_file = f'{DALMATIAN_DOMAINS_DIR}/top_tlds.txt'
    with open(tld_file, 'w') as df:
        # Sorted so the file is stable between runs (set iteration order is
        # not), matching how all_tlds.txt is written.
        for tld in sorted(tlds):
            df.write(f'{tld}\n')

    print(f'Saved {len(tlds)} rows to {tld_file}')


def refresh_majestic_million() -> str:
    """
    Download the latest majestic_million.csv from majestic.com.

    The response body is saved to
    ``{DALMATIAN_DOMAINS_DIR}/majestic_million.csv`` only when the server
    answers with HTTP 200. On any other status the file on disk is left
    untouched and a warning is printed, so a stale (or missing) CSV does not
    go unnoticed.

    Returns the path of the CSV file, whether or not it was refreshed.
    """
    url = 'https://downloads.majestic.com/majestic_million.csv'
    filename = f'{DALMATIAN_DOMAINS_DIR}/majestic_million.csv'

    # NOTE(review): timeout=1 looks short for a download of this size -
    # confirm the unit/semantics of lcrequests' timeout.
    with Request(url, timeout=1) as session:
        response = session.get()

    if response.status_code != 200:
        print(f'Could not download {url} (HTTP {response.status_code}); {filename} was not updated')
        return filename

    # Iterating the response yields pieces of the body; the file is opened
    # in binary mode, so these are presumably bytes - verify against lcrequests.
    with open(filename, 'wb') as file:
        for chunk in response:
            file.write(chunk)

    return filename


def refresh_top_level_domains() -> set[str]:
    """
    Download the latest list of Top Level Domains as a set of strings.

    On HTTP 200 the lower-cased, de-duplicated TLDs are also written, sorted
    and one per line, to ``{DALMATIAN_DOMAINS_DIR}/all_tlds.txt``. Blank
    lines and ``#`` comment lines in the response are ignored.

    Returns the set of TLDs, or an empty set (after printing a warning) when
    the download did not return HTTP 200. Callers that use the result as a
    filter should be aware that an empty set filters nothing.
    """
    url = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'

    with Request(url, timeout=1) as session:
        response = session.get()

    if response.status_code != 200:
        print(f'Could not download {url} (HTTP {response.status_code}); no TLDs loaded')
        return set()

    tlds = set()
    for line in response.text.lower().splitlines():
        # Strip first so trailing "\r" and indented comments are handled,
        # and skip empty lines so '' never ends up in the set or the file.
        tld = line.strip()
        if not tld or tld.startswith('#'):
            continue
        tlds.add(tld)

    tld_file = f'{DALMATIAN_DOMAINS_DIR}/all_tlds.txt'
    with open(tld_file, 'w') as df:
        for tld in sorted(tlds):
            df.write(f'{tld}\n')

    return tlds


def write_category_domains(category: str, domains):
    """
    Write a list of domains to ``<category>/domains.urllist``.

    The category directory is created (including missing parents) if it
    does not exist yet. The file starts with a ``score 500`` line and a
    blank line, followed by one domain per line. A one-line summary is
    printed once the file has been written.
    """
    target_dir = Path(category)
    target_dir.mkdir(parents=True, exist_ok=True)
    category_file = f'{category}/domains.urllist'

    with open(category_file, 'w') as out:
        out.write('score 500\n\n')
        out.writelines(f'{name}\n' for name in domains)

    print(f'Saved {len(domains)} to {category_file}')


# Domains matching this pattern are left out of the generated category lists.
# It is applied with ``.search()``, so every alternative matches anywhere in
# the domain unless it is pinned with ``\b``. Dots in literal domain names are
# escaped so they only match a real ".", not any character (an unescaped
# ``d.to`` would, for example, also hit "detox.com").
SKIP_DOMAINS = re.compile(
    r'''
        \b(bit\.ly|bl\.ink|d\.to|short\.io|tinyurl\.com)   # link shorteners have no reputational value
        |blogger\.com|blogspot|wordpress\.com|medium\.com|weebly\.com|filesusr\.com    # Blogging & user-generated content
        |xn--  # skip puny-coded domains
        |adult
        |beer
        |[ck]asin[ao]|gambl[ei]|lotter|poker|slots
        |celeb
        |discreet
        |eroti[ck]
        |escort
        |\bfree
        |googleusercontent\.com
        |gaming
        |gossip
        |horror
        |interracial
        |gay|lesb(ian|o)
        |bang(er|\b)|fuck
        |dating|bride|girl|hookup|coed
        |hairy|horny
        |\bincest|incest\b|inceste
        |mature|milf
        |mast[eu]rbat
        |naturis[mt]|naked|naakt|nud(e|o|ist|ie)
        |naughty
        |cupid|nymph|vixen|bitch
        |bdsm|orgasm|porn|xxx|public-?sex
        |\bsex|sex\b|sex-?(y|cam|chat|date|doll|diar|gam|guide|po|nice|shop|ual|vid|toy|tub)
        |big-?tit|titt(y|ie)|boob|bust(y|ie)|frontal
        |puss(y|ie)|cunt|clit(s|or|\b)|vagina
        |penis
        |playboy
        |penthouse
        |punk
        |squirt
        |\bshit|shit\b
        |suckmy
        |tranny
        |tattoo
        |torrent
        |vampire
        |voyeur
        |wicca
    ''',
    re.VERBOSE,
)
