# parsertenders/tgbot/services/tender_plan.py
# Snapshot taken from the repository web viewer: 2024-11-03 21:16:44 +07:00,
# 588 lines, 24 KiB, Python.
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters
# (the Cyrillic dictionary keys and messages below). This is intentional and
# the warning can be ignored.
import os
import glob
import requests
import json
import pandas as pd
import re
from thefuzz import fuzz # type: ignore
from bs4 import BeautifulSoup
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from datetime import datetime
import time
import asyncio
from aiohttp import ClientSession
from tgbot.utils.misc.bot_logging import bot_logger
from tgbot.services.parser_tendors import get_articles
# Cookies sent with every tenderplan.ru request made by this module (passed as
# `cookies=cookies` to `requests.get` and to the aiohttp `ClientSession`).
# NOTE(review): the `jwt` value is a bearer credential hard-coded in source —
# presumably it should come from configuration/environment and be rotated;
# confirm with the owner.
cookies = {
'jwt': 's%3ABearer%200b998917d77807264c0b82d9ff64ca6fcefb01437019151eff52a4e706ebcd405099a030761ba0f5c6574aea12c525293c7da298b267c462224502a8c88c5289.p%2FACkoaEjBIK4u5vArNpU3Fh24DqBsyfGcHN8h%2BILig',
'referer': 'https://tenderplan.ru/app',
'source': 'response_type=code&client_id=619e606a7883684e0e3d10c7&redirect_uri=https%253A%252F%252Fbitrix24.tenderplan.ru%252Ftenderplan%252Foauth&scope=resources%253Aexternal%2520comments%2520marks%253Aread%2520notes%2520relations%253Aread%2520firm%253Aread&state=2f8e5161-d7e0-4e88-9e37-df3f1f75f20a',
'__ddg1_': 'ZKa7JlUseYuy3cvawO9W',
}
# HTTP headers sent with every tenderplan.ru request made by this module
# (passed as `headers=headers` to `requests.get` and to the aiohttp
# `ClientSession`). They imitate a Safari browser session.
# NOTE(review): `Authorization` carries the same hard-coded bearer token as the
# `jwt` cookie — presumably it should be loaded from configuration; confirm.
headers = {
'Accept': '*/*',
'Authorization': 'Bearer 0b998917d77807264c0b82d9ff64ca6fcefb01437019151eff52a4e706ebcd405099a030761ba0f5c6574aea12c525293c7da298b267c462224502a8c88c5289',
'Sec-Fetch-Site': 'same-origin',
'Accept-Language': 'ru',
# 'Accept-Encoding': 'gzip, deflate, br',
'Sec-Fetch-Mode': 'cors',
'Host': 'tenderplan.ru',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15',
'Connection': 'keep-alive',
'Referer': 'https://tenderplan.ru/app',
# 'Cookie': 'jwt=s%3ABearer%200b998917d77807264c0b82d9ff64ca6fcefb01437019151eff52a4e706ebcd405099a030761ba0f5c6574aea12c525293c7da298b267c462224502a8c88c5289.p%2FACkoaEjBIK4u5vArNpU3Fh24DqBsyfGcHN8h%2BILig; referer=https://tenderplan.ru/app; source=response_type=code&client_id=619e606a7883684e0e3d10c7&redirect_uri=https%253A%252F%252Fbitrix24.tenderplan.ru%252Ftenderplan%252Foauth&scope=resources%253Aexternal%2520comments%2520marks%253Aread%2520notes%2520relations%253Aread%2520firm%253Aread&state=2f8e5161-d7e0-4e88-9e37-df3f1f75f20a; __ddg1_=ZKa7JlUseYuy3cvawO9W',
'Sec-Fetch-Dest': 'empty',
}
# def search_in_tenderplan():
# https://tenderplan.ru/api/tenders/getlist?q=F-714117
# https://tenderplan.ru/api/tenders/getlist?page=1&q=Komatsu
# ...
def get_urls(article = 0):
    """Build the list of tenderplan search requests.

    Args:
        article: ``0`` (default) to build one request per article number
            returned by ``get_articles()``; otherwise a string with one or
            more article numbers separated by commas (optional spaces around
            the commas are ignored).

    Returns:
        A list of dicts with the keys ``article`` (display label), ``art``
        (the bare article number used as the search term) and ``url`` (the
        ``getlist`` API URL filtered to actual tenders).
    """
    base_url = "https://tenderplan.ru/api/tenders/getlist?isActual=1&q="
    urls = []
    if (article == 0):
        # presumably a DataFrame with "Наименование" and "Артикул" columns,
        # where "Артикул" holds an iterable of article numbers — verify
        # against get_articles().
        catalogue = get_articles()
        for position in range(len(catalogue)):
            val = catalogue.iloc[position]
            for art in val["Артикул"]:
                urls.append({
                    "article": f"{val['Наименование']} / {art}",
                    "art": art,
                    "url": f"{base_url}{art}",
                })
    else:
        # str.split() takes a literal separator, not a pattern, so the list
        # has to be split with the regex engine.
        for art in re.split(r"\s*,\s*", article):
            art = art.strip()
            if not art:
                continue
            # "art" is included so that consumers reading res['url']['art']
            # get the search term for both kinds of request.
            urls.append({"article": f"{art}", "art": art, "url": f"{base_url}{art}"})
    return urls
# async def fetch(url, session):
# st = ''
# try:
# async with session.get(url['url']) as response:
# k = 0
# st = response.status
# while response.status != 200:
# time.sleep(5)
# print('sleeep')
# k += 1
# if k > 5: break
# date = response.headers.get("DATE")
# print(f"{date}:{response.url} with status {response.status}")
# data = {'url': url, 'response': await response.json()}
# return data
# except Exception as e:
# print(e)
# bot_logger.error(f"st:{st} --- {e}")
# async def bound_fetch(sem, url, session):
# # Getter function with semaphore.
# async with sem:
# return await fetch(url, session)
async def fetch(url, session, retry_event, max_retries = 6, retry_delay = 5):
    """Download one search URL and return its decoded JSON.

    Args:
        url: request description dict; ``url['url']`` is the address to GET.
        session: an aiohttp-style client session (``session.get(...)`` must
            be usable as an async context manager).
        retry_event: ``asyncio.Event`` that gates every request; while it is
            cleared no request is sent.
        max_retries: how many times a non-200 answer (including 429) is
            retried after the first attempt.
        retry_delay: pause in seconds between attempts.

    Returns:
        ``{'url': url, 'response': <decoded JSON>}`` on HTTP 200, otherwise
        ``None`` (after the retries are exhausted or on any exception, which
        is logged).
    """
    st = ''
    try:
        for attempt in range(max_retries + 1):
            # Wait until requests are allowed to continue.
            await retry_event.wait()
            async with session.get(url['url']) as response:
                st = response.status
                if st == 200:
                    date = response.headers.get("DATE")
                    print(f"{date}: {response.url} со статусом {response.status}")
                    # The body must be read while the response is still open;
                    # after the context manager exits the connection is
                    # released and the payload is no longer available.
                    return {'url': url, 'response': await response.json()}
            if attempt < max_retries:
                # Asynchronous back-off so other tasks keep running; this also
                # throttles 429 answers instead of re-requesting immediately.
                print('sleeep')
                await asyncio.sleep(retry_delay)
        print(f"Не удалось получить {url['url']} после {max_retries} попыток")
        return None
    except Exception as e:
        print(e)
        bot_logger.error(f"st: {st} --- {e}")
        return None
async def bound_fetch(sem, url, session, retry_event):
    """Run ``fetch`` for one URL while holding a slot of the semaphore ``sem``.

    The semaphore is released again whether ``fetch`` returns or raises.
    """
    await sem.acquire()
    try:
        outcome = await fetch(url, session, retry_event)
        return outcome
    finally:
        sem.release()
# async def get_tenders_from_url(tender_state = 1):
# urls = get_urls(tender_state)
# return await search_tenders(urls)
# async def get_tenders_from_article(article):
# urls = get_urls(article = article)
# return await search_tenders(urls)
def _find_price_rows(search_term, data_dir = "tgbot/data/"):
    """Return ``(file name, row dict)`` pairs from the price lists that mention ``search_term``."""
    # Purely alphabetic (or missing) terms are not looked up in the price lists.
    if not search_term or search_term.isalpha():
        return []
    found = []
    for file in glob.glob(os.path.join(os.path.abspath(data_dir), 'price*.xls*')):
        df = pd.read_excel(file)
        if df.empty:
            continue
        # regex=False: article numbers may contain characters such as "+",
        # "(" or "." that must be matched literally.
        mask = df.apply(
            lambda row: row.astype(str).str.contains(search_term, case=False, regex=False).any(),
            axis=1,
        )
        sear = df[mask].to_dict('index')
        print("sear ", sear)
        for row in sear.values():
            found.append((os.path.basename(file), row))
    return found


def _extract_good_count(payload, art):
    """Return the quantity of the position whose name contains ``art`` ('' when unknown)."""
    try:
        table = json.loads(payload.get('json'))["0"]["fv"]["0"]["fv"]
        if "Количество" not in table["th"]["1"]["fv"]:
            return ''
        goods = table["tb"]
    except (AttributeError, KeyError, TypeError, ValueError):
        # The tender card has no goods table in the expected place.
        return ''
    good_count = ''
    for good in goods.values():
        try:
            good_name = good['0']['fv']
            if art and art in good_name:
                # As before, the last matching position wins.
                good_count = good['1']['fv']
        except (KeyError, TypeError):
            continue
        print(f"name - {good_name}")
    return good_count


def sooup(tenders_id, tenders, res):
    """Append one report row per (tender, price-list match) to ``tenders_id``.

    Args:
        tenders_id: list that collects the report rows; it is extended in
            place and also returned.
        tenders: tender summaries from the ``getlist`` API (dicts with
            ``orderName``, ``_id`` and ``submissionCloseDateTime``).
        res: fetch result; ``res['url']['art']`` is the searched article
            number and ``res['url']['article']`` its display label.

    Returns:
        The same ``tenders_id`` list. For every tender the full card is
        requested from the ``tenders/get`` API. If the article is found in
        the local ``price*.xls*`` files, one row per matching price line is
        added; otherwise a single row with ``'--'`` placeholders is added.
    """
    url_info = res.get('url') or {}
    art = url_info.get('art')
    article = url_info.get('article')

    # The price lookup depends only on the article, so the spreadsheets are
    # read once per call instead of once per tender.
    price_rows = _find_price_rows(art) if tenders else []
    print("price ", price_rows)

    for tender in tenders:
        tend_name = tender.get('orderName')
        tend_id = tender.get('_id')
        close_ts = tender.get('submissionCloseDateTime')
        # The API reports milliseconds since the epoch; it may be absent.
        date_until = datetime.fromtimestamp(close_ts/1000).strftime('%Y-%m-%d') if close_ts else ''

        response = requests.get(
            'https://tenderplan.ru/api/tenders/get',
            params={'id': tend_id},
            cookies=cookies,
            headers=headers,
            timeout=30,
        )
        payload = response.json()  # decoded once and reused below
        good_count = _extract_good_count(payload, art)
        print(tend_id, date_until)

        # A placeholder entry is used only when no price line matched.
        for file_name, p in (price_rows or [('--', None)]):
            has_price = p is not None
            tenders_id.append({
                "article": article,
                "art0": art,
                "файл": file_name,
                "Артикул": p.get("Артикул") if has_price else '--',
                "Бренд": p.get("Бренд") if has_price else '--',
                "Кол-во": p.get("Кол-во") if has_price else '--',
                "Цена": p.get("Цена, руб. с НДС") if has_price else '--',
                "good_count": good_count,
                "id_tender": tend_id,
                "url_tender": f"https://tenderplan.ru/app?tender={tend_id}",
                "date_until": date_until,
                "tend_name": tend_name,
                "platform": (payload.get('platform') or {}).get('name'),
                "href": payload.get('href'),
            })
    return tenders_id
async def search_in_tenderplan(urls = 0):
    """Search tenderplan for every catalogue article and export the matches.

    Args:
        urls: must be left at ``0``; the request list is then built with
            ``get_urls()``. Any other value makes the function return ``0``
            immediately without doing any work (behaviour kept from the
            original implementation).

    Returns:
        The list of report rows produced by ``sooup`` (also written to Excel
        through ``get_excel_from_tenderplan``), ``0`` when ``urls`` was
        supplied, or ``None`` if an unexpected error interrupted the run (the
        rows collected so far are still saved).
    """
    tenders_id = []
    try:
        if urls == 0:
            urls = get_urls()
        else:
            return 0
        # At most three requests are in flight at once.
        sem = asyncio.Semaphore(3)
        retry_event = asyncio.Event()
        retry_event.set()  # requests are allowed from the start
        t = time.time()
        # One client session so connections are reused across requests.
        async with ClientSession(cookies=cookies, headers=headers) as session:
            tasks = [bound_fetch(sem, url, session, retry_event) for url in urls]
            # The coroutines can be awaited only once, so they are gathered a
            # single time; retrying is the responsibility of fetch().
            results = await asyncio.gather(*tasks, return_exceptions=True)
        print(time.time()-t)
        print(len(results))
        t1 = time.time()
        loop = asyncio.get_running_loop()
        for res in results:
            if isinstance(res, BaseException):
                bot_logger.error(f"{res}")
                continue
            if not res:
                # fetch() gave up on this URL.
                continue
            try:
                tenders = (res.get('response') or {}).get('tenders')
                if tenders:
                    # sooup() does blocking HTTP and spreadsheet work, so it
                    # runs in a worker thread to keep the event loop free.
                    tenders_id = await loop.run_in_executor(None, sooup, tenders_id, tenders, res)
                else:
                    print('tenders none')
            except Exception as e:
                print(e)
                bot_logger.error(f"{e}")
        print(time.time() - t1)
        for tend in tenders_id:
            print(tend)
        await loop.run_in_executor(None, get_excel_from_tenderplan, tenders_id)
        return tenders_id
    except Exception as e:
        print(e)
        bot_logger.error(f"{e}")
        # Best effort: keep whatever was collected before the failure.
        try:
            get_excel_from_tenderplan(tenders_id)
        except Exception as save_error:
            print(save_error)
            bot_logger.error(f"{save_error}")
        return None
def get_excel_from_tenderplan(tenders_id, link = 'tgbot/data/tenders_tenderplan_from_art.xlsx'):
    """Write the tender rows to an Excel report, highlighting already-known rows.

    Args:
        tenders_id: list of row dicts (as produced by ``sooup``).
        link: path of the ``.xlsx`` report. If the file already exists it is
            read first and then overwritten.

    Rows that were already present in the previous version of the report
    (equal in every shared column) get a light-red background. When
    ``tenders_id`` yields no rows nothing is written, so the previous report
    is kept.
    """
    df = pd.DataFrame(tenders_id)
    df = df.drop_duplicates(ignore_index=True)
    if df.empty:
        return

    # Read the existing report, if there is one.
    try:
        existing_df = pd.read_excel(link, engine='openpyxl')
    except FileNotFoundError:
        existing_df = pd.DataFrame()

    repeated_rows = []
    common = [column for column in df.columns if column in existing_df.columns]
    # merge() raises when the frames share no columns (e.g. first run), so the
    # comparison is attempted only when there is something to compare.
    if common and not existing_df.empty:
        try:
            # Duplicates in the old report would multiply the left rows and
            # shift the indices used for highlighting below.
            known = existing_df[common].drop_duplicates()
            merged_df = df.merge(known, how='left', on=common, indicator=True)
            repeated_rows = list(merged_df.index[merged_df['_merge'] == 'both'])
        except ValueError as e:
            # Column dtypes differ between fresh data and the re-read file;
            # save the report without highlighting rather than fail.
            print(e)

    # The context manager closes the workbook even if formatting fails.
    with pd.ExcelWriter(link, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Tenders', index=False)
        workbook = writer.book
        worksheet = workbook.get_worksheet_by_name('Tenders')
        highlight = workbook.add_format({'bg_color': '#FFC7CE'})
        last_col = len(df.columns) - 1
        for index in repeated_rows:
            # +1 skips the header row; 'no_errors' matches every ordinary
            # cell, which makes the format apply to the whole row.
            worksheet.conditional_format(index+1, 0, index+1, last_col, {
                'type': 'no_errors',
                'format': highlight,
            })
######################################--------AUTOPITER-------########################################
def split_search(search_string):
    """Pick the words of a goods description that look like part numbers.

    Args:
        search_string: free-text description of a position; ``None`` or an
            empty string yields an empty result.

    Returns:
        The whitespace-separated words that are at least 6 characters long,
        contain no Cyrillic letters and whose digit count is at least a
        quarter of their length. Descriptions with fewer than two digits in
        total yield an empty list.
    """
    if not search_string:
        return []
    if sum(char.isdigit() for char in search_string) <= 1:
        return []
    search_split = []
    # split() without arguments also separates on tabs, newlines and
    # non-breaking spaces, which occur in scraped descriptions.
    for word in search_string.split():
        if len(word) < 6:
            continue
        digit_count = sum(char.isdigit() for char in word)
        # "ё"/"Ё" lie outside the а-я / А-Я ranges and must be listed
        # explicitly to catch every Cyrillic word.
        if digit_count >= len(word) // 4 and not re.search('[а-яА-ЯёЁ]', word):
            search_split.append(word)
    return search_split
def search_in_autopiter(search: str):
    """Look up a part number on autopiter.ru and return the best match.

    Args:
        search: part number (or other text) passed as ``detailNumber``.

    Returns:
        * a one-row ``pandas.DataFrame`` (the candidate with the highest
          fuzzy-match score) with the columns ``ap_search``, ``ap_name``,
          ``fuzz``, ``ap_number``, ``ap_originalPrice``, ``link_autopiter``
          and ``ap_id`` when at least one priced candidate exists;
        * a list with a single placeholder dict (``'----'`` values) when
          nothing suitable was found;
        * an empty list on HTTP 429 or any other error (which is logged).
    """
    params = {
        'detailNumber': search,
        'isFullQuery': 'true',
    }
    try:
        ap_search = search
        resp = requests.get('https://autopiter.ru/api/api/searchdetails', params=params, timeout=30)
        print("response - ", resp.status_code)
        print("good search - ", ap_search)
        if resp.status_code == 429:
            # Rate limited: reported through the common error path below.
            raise Exception("429")
        goodauto = []
        if resp.status_code == 200:
            catalogs = ((resp.json() or {}).get('data') or {}).get('catalogs') or []
            cost_params = {'idArticles': [position.get('id') for position in catalogs]}
            get_cost_resp = requests.get('https://autopiter.ru/api/api/appraise/getcosts', params=cost_params, timeout=30)
            print(get_cost_resp)
            costs_available = get_cost_resp.status_code == 200
            # First positive price per article id, built once instead of
            # rescanning the cost list for every candidate.
            price_by_id = {}
            if costs_available:
                for cost in (get_cost_resp.json() or {}).get('data') or []:
                    price = cost.get('originalPrice')
                    if isinstance(price, (int, float)) and price > 0:
                        price_by_id.setdefault(cost.get('id'), price)
            for position in catalogs:
                ap_name = position.get('name') or ''
                ap_id = position.get('id')
                ap_number = position.get('number')
                if costs_available:
                    # Candidates without a positive price are skipped.
                    if ap_id not in price_by_id:
                        continue
                    ap_originalPrice = price_by_id[ap_id]
                else:
                    ap_originalPrice = "--"
                goodauto.append({
                    'ap_search': ap_search,
                    # Strip control characters that openpyxl refuses to write.
                    'ap_name': ILLEGAL_CHARACTERS_RE.sub(r'', ap_name),
                    'fuzz': fuzz.partial_token_sort_ratio(ap_search, ap_name),
                    'ap_number': ap_number,
                    'ap_originalPrice': ap_originalPrice,
                    'link_autopiter': f"https://autopiter.ru/goods/{ap_number}/{position.get('catalogUrl')}/id{ap_id}",
                    'ap_id': ap_id,
                })
        if goodauto:
            # Keep only the best fuzzy match.
            goodauto = pd.DataFrame(goodauto).sort_values(by="fuzz",ascending=False)[:1]
        else:
            goodauto = [{
                'ap_search': '----',
                'ap_name': '----',
                'fuzz': '',
                'ap_number': '',
                'ap_originalPrice': '',
                'link_autopiter': '----',
                # Same key set as a real match, so consumers can read ap_id.
                'ap_id': '',
            }]
        return goodauto
    except Exception as e:
        print(e)
        bot_logger.error(f"{e}")
        return []
def tenders_with_goods(pagecount: int = 1):
    """Match goods from the latest tenders against autopiter.ru offers.

    Args:
        pagecount: number of ``getlist`` pages (starting at page 0) to scan.

    Returns:
        The list of match rows after ``get_all_price`` refreshed their
        prices; the same rows are written to
        ``tgbot/data/tenders_with_goods.xlsx``. ``None`` is returned if an
        unexpected error interrupts the run (the error is logged).
    """
    def _clean(value):
        # openpyxl rejects control characters in cell values.
        return ILLEGAL_CHARACTERS_RE.sub(r'', value) if isinstance(value, str) else value

    try:
        rows = []
        count = 0
        # A bounded loop: a failing page is skipped instead of being
        # requested again forever, and no request is made past the last page.
        for page in range(pagecount):
            try:
                response = requests.get(
                    'https://tenderplan.ru/api/tenders/getlist',
                    params={'page': page},
                    cookies=cookies,
                    headers=headers,
                    timeout=30,
                )
                tenders = response.json().get('tenders') or []
            except Exception as e:
                print(e)
                bot_logger.error(f"{e}")
                continue
            print(f"--------{len(tenders)}")
            for tend in tenders:
                try:
                    tend_name = _clean(tend.get('orderName'))
                    tender_id = tend.get('_id')
                    print(f"tender -- {tend_name} -- {tender_id}")
                    details_resp = requests.get(
                        'https://tenderplan.ru/api/tenders/get',
                        params={'id': tender_id},
                        cookies=cookies,
                        headers=headers,
                        timeout=30,
                    )
                    raw_details = details_resp.json().get('json') or ''
                    if "ObjectInfo" not in raw_details:
                        continue
                    details = json.loads(raw_details)
                    goods = details["0"]["fv"]["0"]["fv"]["tb"]
                    # Milliseconds since the epoch.
                    submission_close_timestamp = int(details['1']['fv']['1']['fv'])
                    print(submission_close_timestamp)
                    submission_close_datetime = datetime.fromtimestamp(submission_close_timestamp/1000).strftime('%Y-%m-%d %H:%M:%S')
                    for good in goods.values():
                        good_name = good.get('0').get('fv')
                        print(f"name - {good_name}")
                        if not isinstance(good_name, str):
                            continue
                        good_name = _clean(good_name)
                        for name in split_search(good_name):
                            ap_search = search_in_autopiter(name)
                            # Only a DataFrame carries real matches; the list
                            # results mean "nothing found" or "lookup failed".
                            if not isinstance(ap_search, pd.DataFrame):
                                continue
                            for _, ap_row in ap_search.iterrows():
                                count += 1
                                print("count",count)
                                rows.append({
                                    "tend_name": tend_name,
                                    "tend_link": f"https://tenderplan.ru/app?tender={tender_id}",
                                    "tend_under": submission_close_datetime,
                                    "good_name": good_name,
                                    "ap_good_name": name,
                                    "ap_search_name": ap_row.get('ap_name'),
                                    "ap_search_fuzz": ap_row.get('fuzz'),
                                    "ap_search_link": ap_row.get('link_autopiter'),
                                    "ap_id": ap_row.get('ap_id'),
                                    "ap_search_price": ap_row.get('ap_originalPrice'),
                                })
                except Exception as e1:
                    print("error -- ",e1)
                    bot_logger.error(f"{e1}")
        print(f"count - {count}")
        print(f"excel!!!!!!!!!")
        if rows:
            rows = get_all_price(rows)
        tends = pd.DataFrame(rows)
        tends.to_excel(r'tgbot/data/tenders_with_goods.xlsx')
        print(f"excel!!!!!!!!!2")
        return rows
    except Exception as e:
        print(e)
        bot_logger.error(f"{e}")
        return None
def get_all_price(tenders_with_goods):
    """Refresh ``ap_search_price`` of every row with one autopiter cost request.

    Args:
        tenders_with_goods: list of row dicts, each with an ``ap_id`` key.

    Returns:
        The rows for which the cost service returned an entry, with
        ``ap_search_price`` replaced by the returned ``originalPrice``; rows
        without a cost entry are dropped. An input without any article ids
        yields an empty list without contacting the service. If the service
        does not answer with HTTP 200, the rows are returned unchanged so the
        prices found earlier are not lost.
    """
    cost_url = 'https://autopiter.ru/api/api/appraise/getcosts'
    # Unique, non-empty ids in first-seen order.
    ap_ids = list(dict.fromkeys(ap.get('ap_id') for ap in tenders_with_goods if ap.get('ap_id')))
    if not ap_ids:
        return []
    params = {'idArticles': ap_ids}
    get_cost_resp = requests.get(cost_url, params=params, timeout=30)
    retries = 0
    # Back off and retry while the service is rate limiting.
    while get_cost_resp.status_code == 429 and retries <= 10:
        time.sleep(5)
        retries += 1
        get_cost_resp = requests.get(cost_url, params=params, timeout=30)
    print(get_cost_resp.status_code)
    if get_cost_resp.status_code != 200:
        bot_logger.error(f"getcosts failed with status {get_cost_resp.status_code}")
        return list(tenders_with_goods)

    # First cost entry per id, built once instead of rescanning per row.
    price_by_id = {}
    for cost in (get_cost_resp.json() or {}).get('data') or []:
        price_by_id.setdefault(cost.get('id'), cost.get('originalPrice'))

    priced_rows = []
    for ap in tenders_with_goods:
        try:
            article_id = ap.get("ap_id")
            if article_id in price_by_id:
                ap['ap_search_price'] = price_by_id[article_id]
                priced_rows.append(ap)
        except Exception as e:
            print(e)
            bot_logger.error(f"{e}")
    return priced_rows
def beautitext(text: str):
    """Return the ``str()`` of the list of space-separated tokens that are at least half digits.

    A token is kept when its digit count is not less than half of its length
    (integer division), so empty and one-character tokens are always kept.
    """
    def _digit_heavy(token):
        digits = len([symbol for symbol in token if symbol.isdigit()])
        return not digits < len(token) // 2

    kept = [token for token in text.split(" ") if _digit_heavy(token)]
    return str(kept)
# print(beautitext('Помпа ЗМЗ-406, 409 /Евро-3, Евро-4'))
# tenders_with_goods()
# print(search_in_autopiter('Форсунка топливная Common Rail КАМАЗ'))
# curl 'https://tenderplan.ru/api/tenders/getlist?' \
# -X 'GET' \
# -H 'Accept: */*' \
# -H 'Authorization: Bearer f7dcac67acdb2a348f5c81fc26cfafaba892a7bc02dceb97ffe079ad60b0edb4399522590c067b24459b785fa019006943700cc47033ca26b7aecd22f3777077' \
# -H 'Sec-Fetch-Site: same-origin' \
# -H 'Accept-Language: ru' \
# -H 'Accept-Encoding: gzip, deflate, br' \
# -H 'Sec-Fetch-Mode: cors' \
# -H 'Host: tenderplan.ru' \
# -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15' \
# -H 'Referer: https://tenderplan.ru/app?key=0&tender=6628ae9952e24fc13583dd05' \
# -H 'Connection: keep-alive' \
# -H 'Cookie: jwt=s%3ABearer%20f7dcac67acdb2a348f5c81fc26cfafaba892a7bc02dceb97ffe079ad60b0edb4399522590c067b24459b785fa019006943700cc47033ca26b7aecd22f3777077.UH%2BcCzOylTzLr%2BF6Hf4kerem6GuMoVK%2FBSiOYmPCkEc; source=key=0&tender=6628ae9952e24fc13583dd05; previousUrl=tenderplan.ru%2Fbitrix24%2Finstructions%2F; tildasid=1713941269630.966260; tildauid=1713888711831.359844; referer=https://tenderplan.ru/app; __ddg1_=ZKa7JlUseYuy3cvawO9W' \
# -H 'Sec-Fetch-Dest: empty' \
# -H 'Socket: ZTvzcuitnjsM-m4OBCP4'