import lxml.html
import requests
import csv
import time

# Find which Intel processors support AVX-512 and save the supported models to
# a CSV file. Wikipedia microarchitecture pages (see the wikis list below) are
# scraped for links to Intel ARK product pages.

PRODS="products/"
prod_len = len(PRODS)
def getproductid(ark_url):
    # Extract the numeric product id that follows "products/" in an ARK URL.
    # The id runs up to the next '/', the next '.', or the end of the string.
    ind = ark_url.find(PRODS)
    s = ark_url[ind + prod_len:]
    length = s.find('/')
    if length == -1:  # no trailing slug, e.g. ".../products/217246.html"
        length = s.find('.')
    if length == -1:  # bare id with no extension either
        length = len(s)
    prod_id = s[:length]
    print("prod_id:", prod_id)
    return prod_id
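
# Illustrative behaviour on the two ARK URL shapes used in this file (see the
# commented test calls at the bottom):
#   getproductid(".../products/126699/Intel-Core-i9-7980XE-...")  -> "126699"
#   getproductid(".../ark/products/217246.html")                  -> "217246"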


def stringlist(elem):
    # All non-empty text fragments under elem, stripped of whitespace.
    return [es.strip() for es in elem.itertext() if es.strip()]

def get_datapoint(ark_doc, key):
    # ARK renders each spec as an <li>: match the label text, then take the
    # last text fragment of that element as the value.
    for e in ark_doc.xpath('//li'):
        for es in e.itertext():
            if es.strip() == key:
                sl = stringlist(e)
                return sl.pop().replace("Products formerly ", '')
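
# Assumed ARK markup, a rough sketch only (the real pages differ in detail):
#   <li><span>Total Cores</span> <span>8</span></li>
# stringlist() on such an element yields ["Total Cores", "8"], so pop() takes
# the value. The "Products formerly " prefix shows up on code-name values,
# e.g. "Products formerly Comet Lake".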

def get_ark_price(ark_url):
    # Look up the recommended customer price via ARK's JSON endpoint.
    prod_id = getproductid(ark_url)
    price_url = f"https://ark.intel.com/libs/apps/intel/support/ark/recommendedCustomerPrice?ids={prod_id}&mmids=&siteName=ark"
    resp = sess.get(price_url).json()
    if resp[0]:
        price = resp[0]['displayPrice']
        return price.replace('$', '')
    else:
        return None
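
# Assumed shape of the price endpoint's JSON, inferred from the fields used
# above rather than any official documentation:
#   [{"displayPrice": "$513.00", ...}]  ->  "513.00"
# A falsy first element is treated as "no price listed".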

def get_ark_data(ark_url):
    res = sess.get(ark_url)
    ark_doc = lxml.html.fromstring(res.content)
    
    data = []
    ext = get_datapoint(ark_doc, "Instruction Set Extensions")
    print(ark_url)
    if ext and "512" in ext:
        # Scrape everything except the last three columns, which are filled
        # in below (extensions, then one or two price columns).
        for key in data_names[:-3]:
            v = get_datapoint(ark_doc, key)
            data.append(v)
        data.append(ext)
        # Prices may be a range, e.g. "511 - 521"; split into two columns.
        prices = get_ark_price(ark_url)
        if prices is None:
            data.append("")
            data.append("")
        else:
            prices = prices.split('-')
            data.append(prices[0].strip())
            if len(prices) > 1:
                data.append(prices[1].strip())
            else:
                data.append("")

        return True, data
    else:
        return False, None

# Wiki pages already processed in earlier runs:
done=["https://en.wikipedia.org/wiki/Ice_Lake_(microprocessor)","https://en.wikipedia.org/wiki/Kaby_Lake",
"https://en.wikipedia.org/wiki/Whiskey_Lake_(microarchitecture)","https://en.wikipedia.org/wiki/Cannon_Lake_(microprocessor)",
"https://en.wikipedia.org/wiki/Tiger_Lake", "https://en.wikipedia.org/wiki/Comet_Lake",
"https://en.wikipedia.org/wiki/Rocket_Lake", "https://en.wikipedia.org/wiki/Alder_Lake", "https://en.wikipedia.org/wiki/Raptor_Lake"]

# Skylake-family pages (not currently scraped by run_scrape):
wikis_skylake=["https://en.wikipedia.org/wiki/Skylake_(microarchitecture)",
"https://en.wikipedia.org/wiki/Cannon_Lake_(microprocessor)",
"https://en.wikipedia.org/wiki/Cascade_Lake_(microarchitecture)",
"https://en.wikipedia.org/wiki/Cooper_Lake_(microarchitecture)",]

# Pages scraped by run_scrape():
wikis=["https://en.wikipedia.org/wiki/Kaby_Lake",
"https://en.wikipedia.org/wiki/Whiskey_Lake_(microarchitecture)",
"https://en.wikipedia.org/wiki/Tiger_Lake",
"https://en.wikipedia.org/wiki/Comet_Lake",]

# DONE: skylake, cascade lake, rocket lake, cooper lake
# coming soon (server/workstation): https://en.wikipedia.org/wiki/Sapphire_Rapids

# Which datapoints to record: these become the CSV column headers.
data_names=["Processor Number", "Code Name", "Product Collection", "Vertical Segment", "Launch Date", "Lithography", "Total Cores", "Total Threads", "Max Turbo Frequency","Instruction Set Extensions", "Recommended Customer Price", "Price Max"]
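
# Coupling with get_ark_data(): data_names[:-3] (through "Max Turbo Frequency")
# is scraped label by label, "Instruction Set Extensions" is checked for
# AVX-512, and the two price columns come from the price endpoint.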

def run_scrape():
    # newline='' prevents the csv module from writing blank rows on Windows
    with open('cpu_data.csv', 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(data_names)
        # Visit each wiki page and follow every ARK link on it.
        for wiki in wikis:
            print(wiki)
            res = sess.get(wiki)
            doc = lxml.html.fromstring(res.content)
            # iterlinks() yields (element, attribute, link, pos) tuples.
            for (_, _, link, _) in doc.iterlinks():
                if "ark.intel.com" in link:
                    is_avx_512, data = get_ark_data(link)
                    if is_avx_512:
                        print(data)
                        wr.writerow(data)
                    time.sleep(0.1)  # be polite between ARK requests

# Some kind of HTTP connection exhaustion was happening with one-off requests;
# reusing a single session fixed it.
sess = requests.Session()
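
# If connection errors come back, an untested option (standard requests/urllib3
# API) is to mount retries on the session:
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#   sess.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5)))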

run_scrape()
#is_avx_512, data = get_ark_data("https://ark.intel.com/products/126699/Intel-Core-i9-7980XE-X-series-Processor-24_75M-Cache-up-to-4_20-GHz")
#is_avx_512, data = get_ark_data("https://ark.intel.com/content/www/us/en/ark/products/217246.html")
# doesn't support avx-512
#is_avx_512, data = get_ark_data("https://ark.intel.com/content/www/us/en/ark/products/230902/intel-core-i31215ul-processor-10m-cache-up-to-4-40-ghz.html")
#print(data)
# no extensions:
#is_avx_512, data = get_ark_data("https://ark.intel.com/content/www/us/en/ark/products/205686/intel-pentium-processor-6805-4m-cache-up-to-3-00-ghz.html")
#is_avx_512, data = get_ark_data( "https://ark.intel.com/content/www/us/en/ark/products/97527")

# TODO: some AMD scraping too?  https://github.com/HQJaTu/Windows-CPU-support-scraper/blob/main/windows11cpus/importer/amd.py

#is_avx_512, data = get_ark_data("https://ark.intel.com/content/www/us/en/ark/products/212277/intel-core-i511500-processor-12m-cache-up-to-4-60-ghz.html")