Spaces:
Sleeping
Sleeping
def Talabat_excel_extract(url):
    """Scrape a Talabat restaurant page and export its menu as an Excel file.

    The page is downloaded, the JSON embedded in the ``__NEXT_DATA__`` script
    tag is parsed, and one spreadsheet row is produced per menu item.  For
    items flagged with ``hasChoices`` the choices endpoint is queried and the
    option groups are written next to the item (first group on the item's
    own row, further groups on the rows below it).

    Args:
        url: Restaurant page URL.  The last path segment is used as the
            restaurant name and the one before it as the restaurant id.

    Returns:
        A tuple ``(excel_file, filename)`` where ``excel_file`` is a
        ``BytesIO`` positioned at offset 0 containing the workbook and
        ``filename`` is ``"<restaurant_name>_menu.xlsx"``.  ``None`` is
        returned (after printing a message) when the page request does not
        answer with status 200 or the ``__NEXT_DATA__`` script is missing.

    Raises:
        requests.RequestException: If the main page request fails or times
            out.
        KeyError: If the embedded JSON does not have the expected layout.
    """
    import requests
    import json
    import pandas as pd
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse
    from io import BytesIO

    # Without an explicit timeout ``requests`` waits indefinitely on a
    # stalled connection.
    request_timeout = 30

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    def extract_choices(item_id, restaurant_id):
        """Return the decoded choices payload for one item, or None on failure."""
        choice_url = f"https://www.talabat.com/nextMenuApi/v2/branches/{restaurant_id}/menu/{item_id}/choices"
        try:
            response = requests.get(choice_url, headers=headers, timeout=request_timeout)
            if response.status_code == 200:
                return response.json()
        except (requests.RequestException, ValueError):
            # A network error or an undecodable body for a single item is
            # handled like a non-200 answer: report it and carry on.
            pass
        print("Failed to retrieve choices for item ID:", item_id)
        return None

    def build_option_rows(choice_data):
        """Turn a choices payload into one row dict per option group."""
        result = choice_data.get("result") or {}
        choice_for_item = result.get('choiceForItem') or []
        if not choice_for_item:
            # Nothing to read: indexing [0] here would raise IndexError.
            return []
        choice_sections = choice_for_item[0].get('choiceSections') or []
        grouped_data = {}
        for option_group in choice_sections:
            option_group_name = option_group.get('nm', '')
            min_quantity = option_group.get('mnq', '')
            max_quantity = option_group.get('mxq', '')
            options = option_group.get('ich') or []
            for option_index, option in enumerate(options, start=1):
                # Groups are keyed by name; a group only gets a row once it
                # has at least one option.
                row = grouped_data.setdefault(option_group_name, {
                    "Option_group_name": option_group_name,
                    "Min_quantity": min_quantity,
                    "Max_quantity": max_quantity
                })
                row[f"Option_{option_index}_Name"] = option.get('nm', '')
                row[f"Option_{option_index}_Price"] = option.get('pr', '')
        return list(grouped_data.values())

    # url = input("Enter restaurant URL: ")
    parsed_url = urlparse(url)
    # Drop a trailing slash so ".../<id>/<name>/" is parsed like
    # ".../<id>/<name>".
    path_segments = parsed_url.path.rstrip('/').split('/')
    restaurant_name = path_segments[-1]
    restaurant_id = path_segments[-2] if len(path_segments) >= 2 else ''

    response = requests.get(url, headers=headers, timeout=request_timeout)
    if response.status_code != 200:
        print(f"Failed to get menu items. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', id='__NEXT_DATA__')
    if script_tag is None or not script_tag.string:
        # An empty tag is treated like a missing one; there is no JSON to read.
        print("Script tag with id '__NEXT_DATA__' not found.")
        return None

    json_content = json.loads(script_tag.string.strip())
    menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['items']

    menu_items_list = []
    # Maps each category to its 1-based first-seen position, so a category
    # that shows up again later keeps its own number.
    category_positions = {}
    for item_position, item in enumerate(menu_data, start=1):
        item_id = item['id']
        original_section = item['originalSection']
        category_position = category_positions.setdefault(
            original_section, len(category_positions) + 1
        )
        menu_items_list.append({
            "Category": original_section,
            "Category_position": category_position,
            "Item_name": item['name'],
            "Item_position": item_position,
            "Image": item['originalImage'],
            "description": item['description'],
            "price": item['price'],
            "id": item_id
        })
        if item['hasChoices']:
            choice_data = extract_choices(item_id, restaurant_id)
            if choice_data:
                menu_items_list.extend(build_option_rows(choice_data))

    df = pd.DataFrame(menu_items_list)
    # The option columns exist only when at least one option row was built;
    # 'Option_group_name' and 'Max_quantity' are always created together.
    if 'Max_quantity' in df.columns:
        # Move every option column up one row so the first option group of
        # an item lands on the item's own row.  The last option row of each
        # item becomes entirely empty and is removed by dropna() below.
        option_group_name_index = df.columns.get_loc('Option_group_name')
        for i in range(option_group_name_index, len(df.columns)):
            df.iloc[:, i] = df.iloc[:, i].shift(-1)
        # Blank the headers of the per-option columns (everything to the
        # right of 'Max_quantity').
        max_column_index = df.columns.get_loc('Max_quantity')
        column_labels = list(df.columns)
        column_labels[max_column_index + 1:] = [''] * (len(column_labels) - max_column_index - 1)
        df.columns = column_labels
    df_cleaned = df.dropna(how='all')

    # excel_file = f"{restaurant_name}_menu.xlsx"
    # df.to_excel(excel_file, index=False)
    # print(f"Menu items saved to {excel_file}")
    excel_file = BytesIO()
    df_cleaned.to_excel(excel_file, index=False)
    excel_file.seek(0)  # Move to the beginning of the BytesIO stream
    return excel_file, f"{restaurant_name}_menu.xlsx"