Mirror of https://github.com/MeexReay/thinkpad-parse.git (synced 2025-06-24 02:22:59 +03:00)

Commit b1b14f0293: first commit
avito_api.py (new file, 195 lines)
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
from hashlib import sha256


def create_driver(
    headless = True,
    clear_cookies = False
):
    options = Options()
    if headless:
        # The Options.headless property is deprecated in recent Selenium
        # releases; the command-line flag works everywhere.
        options.add_argument("--headless")
    if clear_cookies:
        # Spoofed user agent plus disabled caches, to look like a fresh visitor.
        options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows; U; Windows NT 10.4;; en-US) AppleWebKit/603.21 (KHTML, like Gecko) Chrome/49.0.3713.352 Safari/603.2 Edge/10.30739")
        options.set_preference("browser.cache.disk.enable", False)
        options.set_preference("browser.cache.memory.enable", False)
        options.set_preference("browser.cache.offline.enable", False)
        options.set_preference("network.http.use-cache", False)
    driver = Firefox(options=options)
    if clear_cookies:
        driver.delete_all_cookies()
        driver.get('about:blank')
    return driver


def close_driver(driver: Firefox):
    driver.close()
    driver.quit()


def avito_search(
    driver: Firefox,
    query: str,
    page: int,
    url = "https://www.avito.ru/all/noutbuki?cd=1&p={1}&q={0}",
    cookie = {}
):
    # Open the search page once so cookies can be attached to the right domain,
    # then reload with the cookies applied.
    driver.get(url.format(query, page))
    for a in [{"name": k, "value": v} for k, v in cookie.items()]:
        driver.add_cookie(a)
    driver.get(url.format(query, page))

    objects = []
    for element in driver.find_elements(By.CSS_SELECTOR, "div[data-marker=\"item\"]"):
        try:
            item_url = element.find_element(By.CSS_SELECTOR, "a[itemprop=\"url\"]").get_attribute("href")
            name = element.find_element(By.CSS_SELECTOR, "h3[itemprop=\"name\"]").text
            price, small_description, *_ = map(lambda a: a.text, element.find_elements(By.TAG_NAME, "p"))
            price = int("".join(filter(lambda a: a in "0123456789", price)))
            path = item_url.removeprefix("https://www.avito.ru/").split("?")[0]

            objects.append({
                "url": item_url,
                "path": path,
                "name": name,
                "price": price,
                "small_description": small_description,
                # The path uniquely identifies the listing, so its hash is used
                # for deduplication in thinkpads.txt.
                "hash": sha256(path.encode('utf-8')).hexdigest()
            })
        except Exception:
            # Skip cards (ads, promos) that lack the expected markup.
            pass

    return objects


def avito_get_info(
    driver: Firefox,
    path: str,
    url = "https://www.avito.ru/{0}",
    times = 0
):
    # Give up after 10 failed attempts, but keep the shape callers expect.
    if times == 10: return {"params": {}, "description": "N/A"}

    params = {}
    description = "N/A"

    try:
        driver.get(url.format(path))

        params_div = driver.find_element(By.CSS_SELECTOR, "div[data-marker=\"item-view/item-params\"]")

        # Each parameter is rendered as "Label: value".
        for param in params_div.find_elements(By.TAG_NAME, "p"):
            param_text = param.text
            if ": " in param_text:
                k, v = param_text.split(": ", 1)
                params[k] = v

        description = driver.find_element(By.CSS_SELECTOR, "div[itemprop=\"description\"]").text
    except NoSuchElementException: pass
    except Exception:
        # The page failed to load properly; retry.
        return avito_get_info(driver, path, url, times=times+1)

    return {"params": params, "description": description}


def get_benchmark_score(
    driver: Firefox,
    cpu: str,
    gpu: str,
    cpu_url = "https://browser.geekbench.com/search?q={0}",
    gpu_url = "https://browser.geekbench.com/search?k=v6_compute&q={0}"
):
    # Average the Geekbench scores listed on the first search results page.
    cpu_score = None
    gpu_score = None

    if cpu is not None:
        driver.get(cpu_url.format(cpu.split(",")[0]))

        cpu_score = 0
        score_size = 0
        for column in driver.find_elements(By.CLASS_NAME, "list-col-inner"):
            for score in column.find_elements(By.CLASS_NAME, "list-col-text-score"):
                cpu_score += int(score.text)
                score_size += 1
        if cpu_score != 0 and score_size != 0:
            cpu_score = int(cpu_score / score_size)
        else:
            cpu_score = None

    if gpu is not None:
        driver.get(gpu_url.format(gpu))

        gpu_score = 0
        score_size = 0
        for column in driver.find_elements(By.CLASS_NAME, "list-col-inner"):
            for score in column.find_elements(By.CLASS_NAME, "list-col-text-score"):
                gpu_score += int(score.text)
                score_size += 1
        if gpu_score != 0 and score_size != 0:
            gpu_score = int(gpu_score / score_size)
        else:
            gpu_score = None

    # Fall back to the CPU score for laptops with integrated graphics.
    if gpu_score is None:
        gpu_score = cpu_score

    return {"cpu": cpu_score, "gpu": gpu_score}


def pack_thinkpad(
    driver: Firefox,
    item: dict[str, object],
    info: dict[str, str],
    minimal = True
):
    def g(params, key, mapper=lambda a: a):
        # mapper(params[key]) if the key exists, else None.
        return mapper(params[key]) if key in params else None

    def disk_size(s):
        # Leading digits of e.g. "512 ГБ" -> 512.
        t = ""
        for c in s:
            if c in "0123456789":
                t += c
            else:
                break
        return int(t)

    def disk_type(s):
        # Leading letters of e.g. "SSD 512" -> "SSD", "HDD ..." -> "HDD".
        t = ""
        for c in s:
            if c in "SDH":
                t += c
            else:
                break
        return t

    # The param keys are the Russian labels shown on avito.ru item pages
    # ("Состояние" = condition, "Производитель" = manufacturer, "Процессор" = CPU,
    # "Оперативная память, ГБ" = RAM in GB, "Видеокарта" = GPU, etc.)
    # and must match the site text exactly.
    data = {
        "url": item["url"],
        "hash": item["hash"],
        "price": item["price"],
        "name": item["name"],
        "description": info["description"],
        "params": info["params"],
        "state": g(info["params"], "Состояние"),
        "manufacturer": g(info["params"], "Производитель"),
        "screen_diagonal": g(info["params"], "Диагональ, дюйм"),
        "screen_size": g(info["params"], "Разрешение экрана"),
        "cpu": g(info["params"], "Процессор"),
        "cpu_cores": g(info["params"], "Количество ядер процессора", int),
        "ram": g(info["params"], "Оперативная память, ГБ", int),
        "disk_type": g(info["params"], "Конфигурация накопителей", disk_type),
        "disk_size": g(info["params"], "Объем накопителей, ГБ", disk_size),
        "gpu": g(info["params"], "Видеокарта"),
        "os": g(info["params"], "Операционная система"),
    }

    data["benchmarks"] = get_benchmark_score(driver, data["cpu"], data["gpu"])

    if minimal:
        # Keep only the fields used for scoring and deduplication.
        del data["description"]
        del data["name"]
        del data["params"]
        del data["os"]
        del data["cpu_cores"]
        del data["screen_diagonal"]
        del data["state"]
        del data["manufacturer"]

    return data
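A minimal sketch of how these helpers chain together: search for cards, fetch each card's parameters, then pack them into a scoring-ready dict. It assumes Firefox with geckodriver is installed; the query string and page number here are arbitrary examples, not values from this commit.

# Illustrative usage sketch only; "thinkpad x230" and page 1 are arbitrary.
from avito_api import create_driver, close_driver, avito_search, avito_get_info, pack_thinkpad

driver = create_driver(headless=True)
try:
    for item in avito_search(driver, "thinkpad x230", 1):
        info = avito_get_info(driver, item["path"])
        print(pack_thinkpad(driver, item, info))
finally:
    close_driver(driver)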
cookies.json (new file, 1 line)
{}
find_by_hash.py (new file, 11 lines)
import json

# Look up a saved listing by the SHA-256 hash printed by avito_api.pack_thinkpad.
target_hash = input("hash > ")

with open("thinkpads.txt", "r") as f:
    for line in f:
        item = json.loads(line)

        if item["hash"] == target_hash:
            print(json.dumps(item, indent=2, ensure_ascii=False))

            break
main.py (new file, 44 lines)
from avito_api import *
import threading
import json
import os

# Search URL with the price/RAM/screen filters pre-encoded in the `f` parameter;
# {0} is the query, {1} the page number.
URL = "https://www.avito.ru/all/noutbuki?cd=1&f=ASgCAgECA0XGmgwUeyJmcm9tIjo0MDAwLCJ0byI6MH2coRQUeyJmcm9tIjo0LCJ0byI6bnVsbH2eoRQWeyJmcm9tIjoyMDAsInRvIjpudWxsfQ&q={0}&s=1&p={1}"
COOKIE = json.load(open("cookies.json", "r"))
THREADS = 10
PAGES_PER_THREAD = 10
HEADLESS = True

# Several threads read and append to thinkpads.txt; serialise that access.
file_lock = threading.Lock()

def on_pack(pack):
    # Append the listing as one JSON object per line, skipping hashes already stored.
    with file_lock:
        if os.path.exists("thinkpads.txt"):
            with open("thinkpads.txt", mode="r") as f:
                for i in f:
                    if json.loads(i)["hash"] == pack["hash"]:
                        return
        with open("thinkpads.txt", mode="a") as f:
            f.write(json.dumps(pack, ensure_ascii=False) + "\n")

def parse_page(page):
    driver = create_driver(headless=HEADLESS)

    # Each thread crawls its own block of PAGES_PER_THREAD result pages.
    for p in range(1, PAGES_PER_THREAD + 1):
        p = page * PAGES_PER_THREAD + p

        print(p)

        for item in avito_search(driver, "thinkpad", p, url=URL, cookie=COOKIE):
            info = avito_get_info(driver, item["path"])
            pack = pack_thinkpad(driver, item, info)

            print(pack)

            on_pack(pack)

    close_driver(driver)

threads = []

for page in range(THREADS):
    thread = threading.Thread(target=parse_page, args=(page,))
    thread.start()
    threads.append(thread)

for thread in threads:
    thread.join()
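For clarity, the page arithmetic above gives every worker a disjoint block of result pages: with THREADS = 10 and PAGES_PER_THREAD = 10, thread 0 covers pages 1-10, thread 1 covers 11-20, and so on. A small illustrative sketch of that partition (not part of the commit):

THREADS = 10
PAGES_PER_THREAD = 10

for page in range(THREADS):
    first = page * PAGES_PER_THREAD + 1
    last = (page + 1) * PAGES_PER_THREAD
    print(f"thread {page}: pages {first}-{last}")
# thread 0: pages 1-10
# thread 1: pages 11-20
# ...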
top_best.py (new file, 42 lines)
import json
import webbrowser

# Rank the saved listings by a price/performance score and open the best ones
# in the browser one at a time.

with open("thinkpads.txt", "r") as f:
    items = [json.loads(line) for line in f]

def mean(values):
    # True mean over the listings that actually have the field.
    values = [v for v in values if v is not None]
    return sum(values) / len(values) if values else 0

avg_ram = mean(item["ram"] for item in items)
avg_cpu = mean(item["benchmarks"]["cpu"] for item in items)
avg_gpu = mean(item["benchmarks"]["gpu"] for item in items)
avg_disk = mean(item["disk_size"] for item in items)
min_price = min((item["price"] for item in items), default=-1)
max_price = max((item["price"] for item in items), default=-1)

def get_score(item):
    # Hardware quality relative to the average listing, scaled to about 1000.
    quality = int(((item["ram"] / avg_ram) +
                   (item["benchmarks"]["cpu"] / avg_cpu) +
                   (item["benchmarks"]["gpu"] / avg_gpu) +
                   (item["disk_size"] / avg_disk)) / 4 * 1000)

    # Cheaper listings get a larger price bonus.
    return ((max_price - (item["price"] - min_price)) / min_price) * 1500 + quality

def key_filter(item):
    # Only score listings where every metric used by get_score is present.
    if item["ram"] is None: return False
    if item["benchmarks"]["cpu"] is None: return False
    if item["benchmarks"]["gpu"] is None: return False
    if item["disk_size"] is None: return False
    return True

result = list(filter(key_filter, items))
result = sorted(result, key=get_score, reverse=True)

for item in result:
    webbrowser.open_new_tab(item["url"])
    print(f"{item} (score: {get_score(item)})")
    input()  # press Enter to open the next listing
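To see how the two terms of get_score trade off, here is a worked example with made-up numbers (hypothetical averages and prices, not taken from thinkpads.txt): a listing at the minimum price with exactly average hardware gets quality = 1000 plus the full price bonus of (max_price / min_price) * 1500, and with these numbers the price bonus outweighs doubling every spec, matching the script's cheap-first bias.

# Hypothetical numbers, only to illustrate the shape of get_score().
min_price, max_price = 10000, 40000
avg_ram, avg_cpu, avg_gpu, avg_disk = 8, 5000, 5000, 256

def score(price, ram, cpu, gpu, disk):
    quality = int(((ram / avg_ram) + (cpu / avg_cpu) +
                   (gpu / avg_gpu) + (disk / avg_disk)) / 4 * 1000)
    return ((max_price - (price - min_price)) / min_price) * 1500 + quality

print(score(10000, 8, 5000, 5000, 256))    # cheapest, average specs -> 7000.0
print(score(40000, 16, 10000, 10000, 512)) # priciest, double specs  -> 3500.0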
top_best_ai.py (new file, 32 lines)
import ollama

# Feed the first 10 saved listings to a local LLM and let it pick a laptop.
with open("thinkpads.txt", "r") as f:
    thinkpads_content = "".join(f.readlines()[:10])

# The (Russian) prompt asks the model to pick the cheapest laptop that matches
# these preferences: good battery, 6+ GB RAM, SSD of at least 250 GB, a decent
# CPU, working keyboard/touchpad and screen, at least 2 USB ports, charger
# included; the GPU and OS type do not matter.
prompt = """
Подбери самый дешевый ноутбук подходящий по предпочтениям из данных
Предпочтения:
Хороший аккумулятор
Среднее кол-во памяти (6+ гб)
Тип диска - SSD
Вместительность диска - минимум 250 гб
Хороший процессор
Видеокарта не важна
Цена: минимально возможная
Клавиатура и тачпад: в нормальном состоянии
USB порты: минимум 2
Тип системы: не важен
Монитор: без требования в ремонте
Зарядное устройство: имеется
Данные:
""" + thinkpads_content

# Make sure the model is available locally, then stream the answer.
ollama.pull("llama3.1")

stream = ollama.generate(
    model='llama3.1',
    prompt=prompt,
    stream=True,
)

for chunk in stream:
    print(chunk['response'], end='', flush=True)
print()