from collections import Counter
from string import punctuation

from invoke.context import Context
from invoke.tasks import task
from textblob import TextBlob

from lchttp import HTMLParser
from lchttp.settings import STOP_WORDS
from redwoodctl.language import HTML_TAGS
from redwoodctl.http import download_page

# Translation table that turns every ASCII punctuation character into a space.
mt = str.maketrans(dict.fromkeys(punctuation, " "))

# Treat HTML tag names as stop words too. This mutates the STOP_WORDS object
# imported from lchttp.settings in place.
STOP_WORDS.update(HTML_TAGS)


@task(
    help={
        "url": "Web page URL on which to perform language analysis",
        "count": "Display word frequency count",
        "nouns": "Display noun phrases",
        "both": "Display both word frequency and noun phrases",
    },
)
def nlp(
    ctx: Context, url: str, count: bool = False, nouns: bool = False, both: bool = False
) -> None:
    """
    Download page and extract word counts or noun phrases.
    """
    # At least one report must be requested; bail out before downloading
    # anything when none is. (ctx is not used by this task's body.)
    if not (count or nouns or both):
        print("Specify text features to display (--count / --nouns / --both)")
        return

    response = download_page(url)
    # visible_text() yields pieces of text that are joined one per line
    # before being handed to the analyser.
    visible_lines = HTMLParser(response.text).visible_text()
    analyze("\n".join(visible_lines), count, nouns, both)


def analyze(vt: str, count: bool, nouns: bool, both: bool) -> None:
    """
    Print word-frequency and/or noun-phrase statistics for a block of text.

    Both reports are sorted ascending by count and then alphabetically, so
    the most frequent entries are printed last.

    Args:
        vt: Text to analyse.
        count: Print the word frequency report.
        nouns: Print the noun phrase report.
        both: Print both reports, regardless of ``count`` and ``nouns``.
    """
    tb = TextBlob(vt)

    if count or both:
        print("Word counts:\n============")
        ranked_words = sorted(
            tb.word_counts.items(), key=lambda item: (item[1], item[0])
        )
        for word, total in ranked_words:
            # lower() must be *called*: testing the bound method itself for
            # membership never matches, which silently disabled the
            # stop-word filter.
            if len(word) > 4 and word.lower() not in STOP_WORDS:
                print(f"{word}: {total}")

    if nouns or both:
        print("\nNoun phrase counts:\n===================")
        # Tally every phrase in a single pass rather than re-scanning the
        # whole phrase list once per unique phrase.
        # NOTE(review): assumes TextBlob already lower-cases noun phrases, so
        # this exact-match tally equals the case-insensitive
        # WordList.count() it replaces - confirm for the installed version.
        phrase_totals = Counter(tb.noun_phrases)
        multi_word = [
            (phrase, total) for phrase, total in phrase_totals.items() if " " in phrase
        ]
        for phrase, total in sorted(multi_word, key=lambda item: (item[1], item[0])):
            cp = clean_phrase(str(phrase))
            # Stripping punctuation can shrink a phrase to a single short
            # token, so re-check length and word count after cleaning.
            if len(cp) > 6 and " " in cp:
                print(f"{cp}: {total}")


def clean_phrase(phrase: str) -> str:
    """
    Return ``phrase`` with punctuation blanked out and whitespace normalised.

    Characters covered by the module-level ``mt`` table are replaced by
    spaces; runs of whitespace are then collapsed to single spaces and
    leading/trailing whitespace is dropped.
    """
    depunctuated = phrase.translate(mt)
    # split() without a separator discards empty tokens, which both trims
    # the ends and collapses repeated whitespace.
    tokens = depunctuated.split()
    return " ".join(tokens)


# Public API of this module: only the ``nlp`` task is exported by
# ``from <module> import *``.
__all__ = [
    "nlp",
]
