from lclazy import lazy_re_compile
import re

# For RPZ / IPSet config files, scrub all patterns from
# automated feed lists that match the sites below

# Bare host names that must never be ingested from automated feeds.
# Each name appears exactly once; the matching "www." variant is added
# automatically when DNS_EXCEPTION_LIST is built from this tuple.
BASE_DNS_EXCEPTION_LIST = (
    'adobe.com',
    'archive.org',
    'web.archive.org',
    'appspot.com',
    'azurewebsites.net',
    'baidu.com',
    'cloudflare.com',
    'duckduckgo.com',
    'ddg.com',
    'kagi.com',
    'ib.adnxs.com',  # dead domain but scrub to keep log noise down
    'forms.gle',  # google forms URL shortener for docs.google.com
    'feedproxy.google.com',
    'play.google.com',
    'google.ac',
    'google.ad',
    'google.ae',
    'google.com.af',
    'google.com.ag',
    'google.com.ai',
    'google.al',
    'google.am',
    'google.co.ao',
    'google.com.ar',
    'google.as',
    'google.at',
    'google.com.au',
    'google.az',
    'google.ba',
    'google.com.bd',
    'google.be',
    'google.bf',
    'google.bg',
    'google.com.bh',
    'google.bi',
    'google.bj',
    'google.com.bn',
    'google.com.bo',
    'google.com.br',
    'google.bs',
    'google.bt',
    'google.co.bw',
    'google.by',
    'google.com.bz',
    'google.ca',
    'google.com.kh',
    'google.cc',
    'google.cd',
    'google.cf',
    'google.cat',
    'google.cg',
    'google.ch',
    'google.ci',
    'google.co.ck',
    'google.cl',
    'google.cm',
    'google.cn',
    'g.cn',
    'google.com.co',
    'google.co.cr',
    'google.com.cu',
    'google.cv',
    'google.com.cy',
    'google.cz',
    'google.de',
    'google.dj',
    'google.dk',
    'google.dm',
    'google.com.do',
    'google.dz',
    'google.com.ec',
    'google.ee',
    'google.com.eg',
    'google.es',
    'google.com.et',
    'google.fi',
    'google.com.fj',
    'google.fm',
    'google.fr',
    'google.ga',
    'google.ge',
    'google.gf',
    'google.gg',
    'google.com.gh',
    'google.com.gi',
    'google.gl',
    'google.gm',
    'google.gp',
    'google.gr',
    'google.com.gt',
    'google.gy',
    'google.com.hk',
    'google.hn',
    'google.hr',
    'google.ht',
    'google.hu',
    'google.co.id',
    'google.iq',
    'google.ie',
    'google.co.il',
    'google.im',
    'google.co.in',
    'google.io',
    'google.is',
    'google.it',
    'google.je',
    'google.com.jm',
    'google.jo',
    'google.co.jp',
    'google.co.ke',
    'google.ki',
    'google.kg',
    'google.co.kr',
    'google.com.kw',
    'google.kz',
    'google.la',
    'google.com.lb',
    'google.com.lc',
    'google.li',
    'google.lk',
    'google.co.ls',
    'google.lt',
    'google.lu',
    'google.lv',
    'google.com.ly',
    'google.co.ma',
    'google.md',
    'google.me',
    'google.mg',
    'google.mk',
    'google.ml',
    'google.com.mm',
    'google.mn',
    'google.ms',
    'google.com.mt',
    'google.mu',
    'google.mv',
    'google.mw',
    'google.com.mx',
    'google.com.my',
    'google.co.mz',
    'google.com.na',
    'google.ne',
    'google.com.nf',
    'google.com.ng',
    'google.com.ni',
    'google.nl',
    'google.no',
    'google.com.np',
    'google.nr',
    'google.nu',
    'google.co.nz',
    'google.com.om',
    'google.com.pk',
    'google.com.pa',
    'google.com.pe',
    'google.com.ph',
    'google.pl',
    'google.com.pg',
    'google.pn',
    'google.co.pn',
    'google.com.pr',
    'google.ps',
    'google.pt',
    'google.com.py',
    'google.com.qa',
    'google.ro',
    'google.rs',
    'google.ru',
    'google.rw',
    'google.com.sa',
    'google.com.sb',
    'google.sc',
    'google.se',
    'google.com.sg',
    'google.sh',
    'google.si',
    'google.sk',
    'google.com.sl',
    'google.sn',
    'google.sm',
    'google.so',
    'google.st',
    'google.sr',
    'google.com.sv',
    'google.td',
    'google.tg',
    'google.co.th',
    'google.com.tj',
    'google.tk',
    'google.tl',
    'google.tm',
    'google.to',
    'google.tn',
    'google.com.tr',
    'google.tt',
    'google.com.tw',
    'google.co.tz',
    'google.com.ua',
    'google.co.ug',
    'google.co.uk',
    'google.com',
    'google.com.uy',
    'google.co.uz',
    'google.com.vc',
    'google.co.ve',
    'google.vg',
    'google.co.vi',
    'google.com.vn',
    'google.vu',
    'google.ws',
    'google.co.za',
    'google.co.zm',
    'google.co.zw',
    'goo.gl',
    'accounts.google.com',
    'googleadservices.com',
    'googletagmanager.com',
    'googleusercontent.com',
    'docs.google.com',
    'cloud.google.com',
    'storage.cloud.google.com',
    'bing.com',
    'th.bing.com',
    'microsoft.com',
    'sharepoint.com',
    'apple.com',
    'icloud.com',
    'blogspot.com',
    'paypal.com',
    'amazon.com',
    'amazonaws.com',
    's3.amazonaws.com',
    'amzn.to',
    'ebay.com',
    'fedex.com',
    'ups.com',
    'usps.com',
    'signal.org',
    't.me',
    'telegram.com',
    'facebook.com',
    'instagram.com',
    'pinterest.com',
    'tiktok.com',
    'twitter.com',
    'tumblr.com',
    'reddit.com',
    'x.com',
    'youtube.com',
    'youtu.be',
    'vimeo.com',
    'evernote.com',
    'hotmail.com',
    'craigslist.org',
    'bit.ly',
    'bitly.com',
    # NOTE(review): ".url" does not look like a delegated TLD, so this
    # entry may be a typo for the tinyurl.com entry below - confirm.
    'tiny.url',
    'tinyurl.com',
    't.co',
    't.ly',
    'outlook.com',
    'wikipedia.org',
    'wikimedia.org',
    'mozilla.org',
    'mapbox.com',
    'dropbox.com',
    'onedrive.com',
    'box.com',
    'wix.com',
    'sandhills.com',
    'sandhillscloud.com',
    'xda-developers.com',
    'teamviewer.com',
    'zoom.com',

    # Cert verifications
    'comodoca.com',
    'digicert.com',
    'entrust.net',
    'pki.goog',
    'sectigo.com',
    'starfieldtech.com',
    'trustwave.com',
    'usertrust.com',
    'verisign.com',

    # OS sites
    # (apple.com is already listed above, next to icloud.com)
    'debian.org',
    'freebsd.org',
    'netbsd.org',
    'openbsd.org',
    'redhat.com',
    'pfsense.org',
    'linuxmint.com',
    'windows.com',
    'windowsupdate.com',

    # source control repos
    'atlassian.com',
    'bitbucket.org',
    'github.blog',
    'github.co',
    'github.com',
    'github.dev',
    'github.io',
    'gh.io',
    'gitlab.com',
    'sourceforge.net',
    'app.hubspot.com',
)

# Any domain name that ends with these suffixes
# should not be ingested from public feeds.
# Built from three literal groups concatenated into one flat tuple.
# Entries start with "." except 'web.archive.org'.
EXCEPTION_SUFFIXES = (
    # Vendor / platform domains
    (
        '.adobe.com',
        '.amazonaws.com',
        '.doubleclick.net',
        '.google.com',
        '.googleusercontent.com',
        '.googleapis.com',
        '.googleadservices.com',
        '.live.com',
        '.office.com',
        '.office365.com',
        '.telegram.com',
        '.github.io',
        'web.archive.org',
        '.feedburner.com',
    )
    # OS sites
    + (
        '.apple.com',
        '.debian.org',
        '.freebsd.org',
        '.netbsd.org',
        '.openbsd.org',
        '.redhat.com',
        '.pfsense.org',
        '.linuxmint.com',
        '.windowsupdate.com',
        '.windows.com',
    )
    # Cert verifications
    + (
        '.comodoca.com',
        '.digicert.com',
        '.entrust.net',
        '.pki.goog',
        '.sectigo.com',
        '.starfieldtech.com',
        '.trustwave.com',
        '.usertrust.com',
        '.verisign.com',
    )
)

# Exact-match lookup set: start from every base name as-is ...
DNS_EXCEPTION_LIST = set(BASE_DNS_EXCEPTION_LIST)

# ... then add the "www."-prefixed form of each one.
for dns in BASE_DNS_EXCEPTION_LIST:
    DNS_EXCEPTION_LIST.add(f'www.{dns}')

# Skip these patterns on Malware Patrol domains, as
# they're always blocked via other Redwood categories,
# or shouldn't be considered malware.
#
# The pattern is passed with re.VERBOSE, so whitespace and the "#"
# comments inside the string are ignored by the regex engine.
# Alternatives, in order:
#   - adult-content keywords ("bdsm" only at a word boundary; "sex"
#     only when followed by a word boundary, "i" or "y")
#   - gambling keywords
#   - strings that start with "archive."
#   - a fixed set of banking / card brand names followed by ".com",
#     with a word boundary in front of the brand name
# No re.IGNORECASE flag is passed here.
# NOTE(review): lazy_re_compile comes from lclazy; presumably it defers
# compilation until first use - confirm. Whether callers apply this with
# search() or match() is not visible here, and that decides whether the
# unanchored alternatives can hit in the middle of a name.
SKIP_PATTERNS_MP_DOMAINS = lazy_re_compile(
    regex=r"""
    adult|bdsm\b|\bbdsm|fuck|hentai|mature|porn
    |sex(\b|i|y)|seks
    |casino|gambling|lottery
    # skip sites like archive.torproject.org & archive.stanfordreview.org which should be classified as something other than malware
    |^archive\.  
    # Skip common banking sites
    |\b(americanexpress|capitalone|chase|citi|discover|td|visa|wellsfargo)\.com
    """,
    flags=re.VERBOSE,
)
