Skip to content

ScrapNLearn #109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
Marina502792 opened this issue Feb 5, 2025 · 0 comments
Open

ScrapNLearn #109

Marina502792 opened this issue Feb 5, 2025 · 0 comments

Comments

@Marina502792
Copy link

Marina502792 commented Feb 5, 2025

Этот код автоматически собирает данные о футбольных матчах из интернета, анализирует их и создает модели для предсказания:

Кто выиграет (домашняя команда или гостевая).
Будет ли больше 2,5 голов в матче.
Код использует Selenium для сбора данных, Pandas для их обработки, XGBoost для создания моделей и Flask для создания API, который позволяет получать прогнозы.

import pandas as pd
from selenium import webdriver
from selenium.webdriver.safari.service import Service as SafariService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import random
import logging
import yaml
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from flask import Flask, request, jsonify

# Logging configuration: records at INFO level and above are written to
# app.log and also emitted through a default StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(),
    ],
)

def load_config(config_path='config.yaml'):
    """Load the application settings from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration as a dictionary.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping (for
            example, when the file is empty).
    """
    # Explicit encoding so the file is decoded the same way on every platform
    # instead of depending on the locale default.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    # safe_load returns None for an empty document; fail here with a clear
    # message rather than later with an opaque TypeError on config[...].
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file {config_path!r} must contain a YAML mapping "
            f"at the top level, got {type(config).__name__}."
        )
    return config

def _pause(low, high, moment):
    """Sleep for a random number of seconds in [low, high] and log the delay."""
    wait_time = random.uniform(low, high)
    logging.info(f"Waiting for {wait_time} seconds {moment}.")
    time.sleep(wait_time)


def _click(wait, xpath):
    """Wait until the element matching *xpath* is clickable, then click it."""
    wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()


def _locate(wait, xpath):
    """Wait until the element matching *xpath* is present and return it."""
    return wait.until(EC.presence_of_element_located((By.XPATH, xpath)))


def _parse_score(score):
    """Split a ``"<home>-<away>"`` score string into two integers.

    Raises:
        ValueError: If the text is not exactly two integers separated by a
            single hyphen.
    """
    home_goals, away_goals = map(int, score.split('-'))
    return home_goals, away_goals


def get_match_data(match_url, config):
    """Scrape one match page with Safari and return its details.

    Args:
        match_url: URL of the match page to open.
        config: Parsed configuration. Reads ``config['selenium']`` keys
            ``driver_path``, ``implicit_wait`` and ``wait_time``.

    Returns:
        A dict with the keys ``home_team``, ``away_team``, ``home_goals``,
        ``away_goals``, ``shots``, ``start_time`` and ``location``, or
        ``None`` when the page could not be scraped or the score could not be
        parsed (the reason is logged).
    """
    selenium_cfg = config['selenium']
    service = SafariService(selenium_cfg['driver_path'])
    options = webdriver.SafariOptions()
    driver = webdriver.Safari(service=service, options=options)

    try:
        driver.implicitly_wait(selenium_cfg['implicit_wait'])
        driver.get(match_url)
        wait = WebDriverWait(driver, selenium_cfg['wait_time'])

        # Imitate a human visitor with a random pause.
        _pause(2, 5, "before interacting with the page")

        # Switch to the statistics tab so the extra data gets loaded.
        _click(wait, '//a[@href="#statistics;1"]')
        _pause(2, 4, "after clicking statistics tab")

        # Read the statistics cell.
        shots = _locate(
            wait, '//div[@class="statBox__cell"][contains(text(), "Shots")]'
        ).text.strip()

        # Go back to the "Summary" tab.
        _click(wait, '//a[@href="#summary"]')
        _pause(1, 3, "after clicking summary tab")

        # Read the match details; team names come from the logo alt text.
        home_team = _locate(
            wait,
            '//div[@class="duelParticipant__home"]//a[@class="participant__participantLink participant__participantLink--team"]//img',
        ).get_attribute('alt')
        away_team = _locate(
            wait,
            '//div[@class="duelParticipant__away"]//a[@class="participant__participantLink participant__participantLink--team"]//img',
        ).get_attribute('alt')
        score = _locate(wait, '//div[@class="detailScore__wrapper"]').text.strip()
        start_time = _locate(wait, '//div[@class="duelParticipant__startTime"]').text.strip()

        # Match venue.
        location = _locate(wait, '//div[@class="duelParticipant__location"]').text.strip()

    except TimeoutException:
        logging.error(f"TimeoutException: Не удалось загрузить данные о матче для {match_url}.")
        return None
    except NoSuchElementException as e:
        logging.error(f"NoSuchElementException: Элемент не найден - {e} для {match_url}.")
        return None
    except Exception as e:
        logging.error(f"Unexpected error: {e} для {match_url}.")
        return None
    finally:
        # Always release the browser session, including when page loading
        # itself fails, so no Safari instance is left behind.
        driver.quit()

    # Parse the score text into integer goal counts.
    try:
        home_goals, away_goals = _parse_score(score)
    except ValueError:
        logging.error(f"Error parsing score for {match_url}: {score}")
        return None

    return {
        'home_team': home_team,
        'away_team': away_team,
        'home_goals': home_goals,
        'away_goals': away_goals,
        'shots': shots,
        'start_time': start_time,
        'location': location,
    }

# Columns fed to both models, in the order the models are trained on.
_FEATURE_COLUMNS = ['home_goals', 'away_goals', 'shots', 'location']


def _scrape_matches(match_urls, config):
    """Scrape all URLs on a thread pool and return the successful results."""
    match_data = []
    # NOTE(review): Safari's WebDriver is documented as allowing one session
    # at a time - confirm that five parallel workers actually work.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {
            executor.submit(get_match_data, url, config): url for url in match_urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as e:
                logging.error(f"Error processing {url}: {e}")
                continue
            if data:
                match_data.append(data)
                logging.info(f"Data successfully extracted for {url}.")
            else:
                logging.warning(f"No data extracted for {url}.")
    return match_data


def _encode_features(frame, location_categories):
    """Return the feature columns of *frame* converted to numeric dtypes.

    Numeric columns are coerced with ``pd.to_numeric`` (unparseable values
    become NaN); ``location`` is replaced by its integer code within
    *location_categories* (-1 for values not in that list).
    """
    features = frame[_FEATURE_COLUMNS].copy()
    for column in ('home_goals', 'away_goals', 'shots'):
        features[column] = pd.to_numeric(features[column], errors='coerce')
    features['location'] = pd.Categorical(
        features['location'], categories=location_categories
    ).codes
    return features


def _train_classifier(label, X, y, X_train, X_test, y_train, y_test, random_state):
    """Fit an XGBoost classifier, log hold-out and 5-fold CV accuracy, return it."""
    model = XGBClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    logging.info(f"Accuracy for {label} prediction (XGBoost): {accuracy}")

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    logging.info(f"Cross-validation accuracy scores for {label} prediction: {cv_scores}")
    logging.info(f"Mean cross-validation accuracy for {label} prediction: {cv_scores.mean()}")
    return model


def _create_app(model_home_win, model_total_goals, location_categories):
    """Build the Flask app exposing both models as POST JSON endpoints."""
    app = Flask(__name__)

    def features_from_request():
        """Return ``(features, None)`` or ``(None, error_message)``."""
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict):
            return None, "Request body must be a JSON object."
        missing = [name for name in _FEATURE_COLUMNS if name not in payload]
        if missing:
            return None, f"Missing fields: {missing}"
        try:
            return _encode_features(pd.DataFrame([payload]), location_categories), None
        except (TypeError, ValueError) as e:
            return None, f"Invalid field value: {e}"

    @app.route('/predict_home_win', methods=['POST'])
    def predict_home_win():
        features, error = features_from_request()
        if error:
            return jsonify({'error': error}), 400
        prediction = model_home_win.predict(features)
        return jsonify({'home_win': bool(prediction[0])})

    @app.route('/predict_total_goals', methods=['POST'])
    def predict_total_goals():
        features, error = features_from_request()
        if error:
            return jsonify({'error': error}), 400
        prediction = model_total_goals.predict(features)
        return jsonify({'total_goals_over_2_5': bool(prediction[0])})

    return app


def main():
    """Scrape the configured matches, train both models and serve them.

    Reads ``urls.match_urls`` and ``model.test_size`` / ``model.random_state``
    from the configuration. The optional ``api.debug`` flag (default False)
    controls Flask debug mode. The API server is started only when the module
    is executed as a script.
    """
    config = load_config()
    match_urls = config['urls']['match_urls']

    match_data = _scrape_matches(match_urls, config)
    if not match_data:
        logging.warning("No data to create DataFrame.")
        return

    df = pd.DataFrame(match_data)

    # Target variables.
    df['home_win'] = (df['home_goals'] > df['away_goals']).astype(int)
    df['away_win'] = (df['away_goals'] > df['home_goals']).astype(int)
    df['draw'] = (df['home_goals'] == df['away_goals']).astype(int)
    df['total_goals'] = df['home_goals'] + df['away_goals']
    # Column name uses an underscore so it matches the lookup below.
    df['total_goals_over_2_5'] = (df['total_goals'] > 2.5).astype(int)

    # Features and targets. Text columns are encoded numerically because
    # XGBoost does not accept object-dtype DataFrame columns.
    # NOTE(review): home_goals and away_goals fully determine both targets, so
    # the models see the answer in their inputs. Kept to preserve the API's
    # input schema - replace with pre-match features.
    location_categories = sorted(df['location'].dropna().unique())
    X = _encode_features(df, location_categories)
    y_home_win = df['home_win']
    y_total_goals_over_2_5 = df['total_goals_over_2_5']

    # One split call keeps the rows of X and of both targets aligned.
    model_cfg = config['model']
    random_state = model_cfg['random_state']
    (X_train, X_test,
     y_train_home, y_test_home,
     y_train_total, y_test_total) = train_test_split(
        X, y_home_win, y_total_goals_over_2_5,
        test_size=model_cfg['test_size'],
        random_state=random_state,
    )

    # Winner model and total-goals model, both XGBoost.
    model_home_win = _train_classifier(
        'home win', X, y_home_win,
        X_train, X_test, y_train_home, y_test_home, random_state,
    )
    model_total_goals = _train_classifier(
        'total goals', X, y_total_goals_over_2_5,
        X_train, X_test, y_train_total, y_test_total, random_state,
    )

    # Example prediction on the first scraped match.
    new_match = X.iloc[[0]]
    prediction_home = model_home_win.predict(new_match)
    prediction_total = model_total_goals.predict(new_match)
    logging.info(f"Home win prediction: {'Home team wins' if prediction_home[0] == 1 else 'Away team wins or draw'}")
    logging.info(f"Total goals prediction: {'Over 2.5' if prediction_total[0] == 1 else 'Under 2.5'}")

    # Prediction API.
    app = _create_app(model_home_win, model_total_goals, location_categories)

    if __name__ == '__main__':
        # Debug mode is opt-in: it enables the interactive debugger and a
        # reloader that re-executes this script, repeating the scrape.
        debug = bool((config.get('api') or {}).get('debug', False))
        app.run(debug=debug)

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant