 # Create the dataset for the chatbox about League of Legends Lore
 Author: **HOANG Caroline**

This notebook is set up to run on Google Colab.

## 1. Setup

Chrome Installation

In [None]:
# Install Chrome
# Refresh the package index, install the download/unpack tools, then fetch and
# install the current stable Chrome .deb package.
!apt-get update -q
!apt-get install -y wget curl unzip
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
# dpkg does not resolve dependencies itself; let apt install whatever is missing.
!apt --fix-broken install -y

# Install dependencies for Chrome
!apt-get install -y libx11-dev libx11-xcb1 libglu1-mesa libxi6 libgconf-2-4 libnss3 libxss1 libappindicator3-1 libasound2 libxtst6

# Install ChromeDriver
# NOTE(review): the driver is pinned to 113.0.5672.63 while the Chrome package
# above is "stable_current"; the two versions may not match - confirm.
!wget -q https://chromedriver.storage.googleapis.com/113.0.5672.63/chromedriver_linux64.zip
!unzip chromedriver_linux64.zip
# Put the driver binary on the PATH and make it executable.
!mv chromedriver /usr/bin/chromedriver
!chmod +x /usr/bin/chromedriver

Packages

In [None]:
# selenium provides the webdriver API and webdriver-manager provides
# ChromeDriverManager; both are imported by the scraping cell of this notebook.
pip install selenium webdriver-manager

## 2. Retrieve texts from website

### Methodology
1. Get all the champion names from https://leagueoflegends.fandom.com/wiki/List_of_champions and store them in a list using BeautifulSoup. The same extraction also yields each champion's role, which is kept as extra information. \\
2. For each champion {champion_name}, go to https://universe.leagueoflegends.com/en_GB/story/champion/{champion_name}/ and extract the story text using BeautifulSoup. The champion name is first transformed to fit the URL. \\
BeautifulSoup extracts the HTML text faster than Selenium. However, it sometimes fails to find the story text (for example for Aurora and Ambessa) because the text does not appear in the retrieved page. In that case we fall back to Selenium.
3. Create a dataframe that associates each champion's name with its role and story.

## Step 1: Collect champion names

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the list of champions
url = "https://leagueoflegends.fandom.com/wiki/List_of_champions"

# Send a GET request to fetch the page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing an
# error page into an empty or wrong champion list.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all td tags that carry a data-sort-value attribute
champion_td_tags = soup.find_all('td', attrs={'data-sort-value': True})

# Keep only the attribute values; the next cell splits this flat list into
# names and roles by position.
champions = [tag['data-sort-value'] for tag in champion_td_tags]

In [None]:
# Split the flat extraction into names and roles.
# The values come in groups of three: position 0 of each group is the champion
# name, position 1 is its role, and position 2 is not used.
champions_name = champions[0::3]
champions_role = champions[1::3]

## Step 2: Extract each champion's lore

In [None]:
import requests
from bs4 import BeautifulSoup
# Accumulator filled by scrape_champion_data: one {"Champion": ..., "Text": ...}
# dict is appended per scraped champion.
champions_data = []
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# The scraping loop iterates over `champions`; point it at the names only
champions = champions_name

# Headless Chrome configuration shared by every Selenium fallback
chrome_options = Options()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)

# Driver service; webdriver-manager resolves the chromedriver binary to use
_driver_path = ChromeDriverManager().install()
service = Service(_driver_path)

# Function to scrape champion data and store it
def scrape_champion_data(champion_name):
    """Fetch one champion's story text and append it to ``champions_data``.

    Spaces and apostrophes are removed from the name to build the page URL.
    The content of the page's ``<meta name="description">`` tag is used when
    the tag is present. Otherwise, and always for "ambessa", the page is
    opened in headless Chrome and the text of the element located with the
    class value "root_3nvd.dark_1RHo" is read instead.

    Exactly one ``{"Champion": ..., "Text": ...}`` record is appended per
    call. When both strategies fail, the text is a "not found" placeholder,
    so the records stay aligned with the list of champions that was scraped.

    Args:
        champion_name: Champion name as a string (the caller passes it
            lower-cased).

    Returns:
        None. The result is stored in the module-level ``champions_data``.
    """
    # Normalise the name for the URL: remove spaces and apostrophes
    champion_name = champion_name.replace(" ", "").replace("'", "")
    url = f"https://universe.leagueoflegends.com/en_GB/story/champion/{champion_name}/"
    print(url)

    # Fast path: plain HTTP request parsed with BeautifulSoup
    meta_tag = None
    try:
        # The timeout prevents one stalled request from blocking the whole run
        response = requests.get(url, timeout=30)
        response.encoding = 'utf-8'  # Set the encoding to UTF-8, or use response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the tag with the biography content
        meta_tag = soup.find('meta', attrs={'name': 'description'})
    except requests.RequestException:
        # A network failure is not fatal: fall through to the browser attempt
        meta_tag = None

    if meta_tag and champion_name != "ambessa":
        champions_data.append({
            "Champion": champion_name,
            "Text": meta_tag.get('content')
        })
        return

    # Slow path: render the page in headless Chrome
    # NOTE(review): a compound class value is passed through By.CLASS_NAME,
    # exactly as in the original code - confirm it matches the intended element.
    story_locator = (By.CLASS_NAME, "root_3nvd.dark_1RHo")
    browser = None
    try:
        browser = webdriver.Chrome(service=service, options=chrome_options)

        # Open the page
        browser.get(url)

        # Wait (up to 10 s) for the desired element to be present
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(story_locator)
        )

        # Scroll down to ensure content is loaded
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")

        # Extract the text content
        text_content = browser.find_element(*story_locator).text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the scraping loop.
        text_content = f"Biography content not found for {champion_name}."
        print("Biography content not found.")
    finally:
        # Always release the Chrome process, including when the wait or the
        # lookup failed; otherwise every failed champion leaks a browser.
        if browser is not None:
            try:
                browser.quit()
            except Exception:
                print("Could not close the browser cleanly.")

    # Single append for the browser path keeps one record per champion
    champions_data.append({
        "Champion": champion_name,
        "Text": text_content
    })
# Scrape every champion in turn; the scraper receives the lower-cased name
for name in champions:
    lowered_name = name.lower()
    scrape_champion_data(lowered_name)


## Step 3: Transform into dataframe

In [None]:
# Create a DataFrame from the list of champions data.
# Naming the expected columns explicitly (instead of renaming by position)
# keeps this cell working even when champions_data is empty.
df1 = pd.DataFrame(champions_data, columns=["Champion", "Text"])

# Rename columns: the scraped text is the champion's story
df1 = df1.rename(columns={"Text": "Story"})

# Add champions role (must have one entry per scraped champion)
df1['Role'] = champions_role

# Undo the champions name changed for url
df1['Champion'] = champions_name

# Save it to a CSV file if you want
# (the original wrote `d1f.to_csv`, which raised NameError)
df1.to_csv("champions_data.csv", index=False)