|
|
|
# Location of a JSON results asset.
# NOTE(review): the path still names OpenVLM.json rather than a Shopping MMLU
# file -- confirm this is the intended source (or that the constant is unused).
URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"

# File name of the overall Shopping MMLU results JSON.
RESULTS = 'ShoppingMMLU_overall.json'

# Raw-content URL of the README in the Shopping MMLU GitHub repository.
SHOPPINGMMLU_README = 'https://raw.githubusercontent.com/KL4805/ShoppingMMLU/refs/heads/main/README.md'
|
|
|
# BibTeX entry for the Shopping MMLU paper. Kept as a raw string so the entry
# is emitted exactly as written.
CITATION_BUTTON_TEXT = r"""@article{jin2024shopping,
title={Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models},
author={Jin, Yilun and Li, Zheng and Zhang, Chenwei and Cao, Tianyu and Gao, Yifan and Jayarao, Pratik and Li, Mao and Liu, Xin and Sarkhel, Ritesh and Tang, Xianfeng and others},
journal={arXiv preprint arXiv:2410.20745},
year={2024}
}"""

# Caption text accompanying the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
# Markdown header for the leaderboard page. The template contains three
# positional `{}` placeholders, in order: number of LLMs, number of main
# shopping skills, and the last-updated timestamp (presumably filled with
# str.format by the caller -- verify against the consumer).
# The identifier is misspelled ("LEADERBORAD"); it is kept as-is because other
# modules may import it under this name.
# NOTE(review): the contact addresses read "[email protected]", which looks like
# an e-mail obfuscation placeholder -- replace with the real addresses.
LEADERBORAD_INTRODUCTION = """# Shopping MMLU Leaderboard
### Welcome to Shopping MMLU Leaderboard! On this leaderboard we share the evaluation results of LLMs obtained by the OpenSource Framework:
### [Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models](https://github.com/KL4805/ShoppingMMLU) 🏆
### Currently, Shopping MMLU Leaderboard covers {} different LLMs and {} main online shopping skills.

This leaderboard was last updated: {}.

Shopping MMLU Leaderboard only includes open-source LLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [Shopping MMLU](https://github.com/KL4805/ShoppingMMLU) to support your LLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [email protected] and [email protected].
"""

# Correctly spelled alias for new code; refers to the same string object.
LEADERBOARD_INTRODUCTION = LEADERBORAD_INTRODUCTION
|
|
|
# Names of the per-model metadata fields (as opposed to score fields).
META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The four main Shopping MMLU skills, in display order.
MAIN_FIELDS = [
    'Shopping Concept Understanding',
    'Shopping Knowledge Reasoning',
    'User Behavior Alignment',
    'Multi-lingual Abilities',
]
|
|
|
|
|
|
|
|
|
# Skills selected by default; currently the same four entries as the main
# skill list, kept as an independent list object.
DEFAULT_BENCH = [
    'Shopping Concept Understanding',
    'Shopping Knowledge Reasoning',
    'User Behavior Alignment',
    'Multi-lingual Abilities',
]

# Parameter-count buckets (in billions); 'Unknown' covers models whose size
# is not available.
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']

# Model availability categories.
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
|
|
|
|
# Markdown descriptions keyed by 'MAIN' and by each skill name.
LEADERBOARD_MD = {}

# Overview text for the main view. A plain string is used: the text has no
# `{...}` placeholders, so an f-string prefix would do nothing.
LEADERBOARD_MD['MAIN'] = """
## Included Shopping Skills:

- Shopping Concept Understanding: Understanding domain-specific short texts in online shopping (e.g. brands, product models).
- Shopping Knowledge Reasoning: Reasoning over commonsense, numeric, and implicit product-product multi-hop knowledge.
- User Behavior Alignment: Modeling heterogeneous and implicit user behaviors (e.g. click, query, purchase).
- Multi-lingual Abilities: Online shopping across marketplaces around the globe.

## Main Evaluation Results

- Metrics:
- Avg Score: The average score on all 4 online shopping skills (normalized to 0 - 100, the higher the better).
- Detailed metrics and evaluation results for each skill are provided in the subsequent tabs.
"""
|
|
|
|
|
|
|
# Markdown for the "Shopping Concept Understanding" skill: a short rationale
# followed by the sub-skills (bold bullets) and their tasks.
# NOTE(review): task bullets sit at the same list level as the bold sub-skill
# bullets; indent them if a nested rendering is wanted.
LEADERBOARD_MD['Shopping Concept Understanding'] = """
## Shopping Concept Understanding Evaluation Results

Online shopping concepts such as brands and product models are domain-specific and not often seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs) and thus no sufficient contexts are given to help understand them. Hence, failing to understand these concepts compromises the performance of LLMs on downstream tasks.

The included sub-skills and tasks include:
- **Concept Normalization**:
- Product Category Synonym
- Attribute Value Synonym
- **Elaboration**:
- Attribute Explanation
- Product Category Explanation
- **Relational Inference**:
- Applicable Attribute to Product Category
- Applicable Product Category to Attribute
- Inapplicable Attributes
- Valid Attribute Value Given Attribute and Product Category
- Valid Attribute Given Attribute Value and Product Category
- Product Category Classification
- Product Category Generation
- **Sentiment Analysis**:
- Aspect-based Sentiment Classification
- Aspect-based Review Retrieval
- Aspect-based Review Selection
- Aspect-based Reviews Overall Sentiment Classification
- **Information Extraction**:
- Attribute Value Extraction
- Query Named Entity Recognition
- Aspect-based Review Keyphrase Selection
- Aspect-based Review Keyphrase Extraction
- **Summarization**:
- Attribute Naming from Description
- Product Category Naming from Description
- Review Aspect Retrieval
- Single Conversation Topic Selection
- Multi-Conversation Topic Retrieval
- Product Keyphrase Selection
- Product Keyphrase Retrieval
- Product Title Generation
"""
|
|
|
|
|
# Markdown for the "Shopping Knowledge Reasoning" skill: a short rationale
# followed by the sub-skills (bold bullets) and their tasks.
# NOTE(review): task bullets sit at the same list level as the bold sub-skill
# bullets; indent them if a nested rendering is wanted.
LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
## Shopping Knowledge Reasoning Evaluation Results

This skill focuses on understanding and applying various implicit knowledge to perform reasoning over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph.

The included sub-skills and tasks include:
- **Numeric Reasoning**:
- Unit Conversion
- Product Numeric Reasoning
- **Commonsense Reasoning**
- **Implicit Multi-Hop Reasoning**:
- Product Compatibility
- Complementary Product Categories
- Implicit Attribute Reasoning
- Related Brands Selection
- Related Brands Retrieval
"""
|
|
|
# Markdown for the "User Behavior Alignment" skill: a two-paragraph rationale
# followed by the sub-skills (bold bullets) and their tasks.
# NOTE(review): task bullets sit at the same list level as the bold sub-skill
# bullets; indent them if a nested rendering is wanted.
LEADERBOARD_MD['User Behavior Alignment'] = """
## User Behavior Alignment Evaluation Results

Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text.

Consequently, LLMs trained with general texts encounter challenges in aligning with the heterogeneous and implicit user behaviors as they rarely observe such inputs during pre-training.

The included sub-skills and tasks include:
- **Query-Query Relations**:
- Query Re-Writing
- Query-Query Intention Selection
- Intention-Based Related Query Retrieval
- **Query-Product Relations**:
- Product Category Selection for Query
- Query-Product Relation Selection
- Query-Product Ranking
- **Sessions**:
- Session-based Query Recommendation
- Session-based Next Query Selection
- Session-based Next Product Selection
- **Purchases**:
- Product Co-Purchase Selection
- Product Co-Purchase Retrieval
- **Reviews and QA**:
- Review Rating Prediction
- Aspect-Sentiment-Based Review Generation
- Review Helpfulness Selection
- Product-Based Question Answering
"""
|
|
|
# Markdown for the "Multi-lingual Abilities" skill: a one-sentence rationale
# followed by the sub-skills (bold bullets) and their tasks.
# NOTE(review): task bullets sit at the same list level as the bold sub-skill
# bullets; indent them if a nested rendering is wanted.
LEADERBOARD_MD['Multi-lingual Abilities'] = """
## Multi-lingual Abilities Evaluation Results

Multi-lingual models are desired in online shopping as they can be deployed in multiple marketplaces without re-training.

The included sub-skills and tasks include:
- **Multi-lingual Shopping Concept Understanding**:
- Multi-lingual Product Title Generation
- Multi-lingual Product Keyphrase Selection
- Cross-lingual Product Title Translation
- Cross-lingual Product Entity Alignment
- **Multi-lingual User Behavior Alignment**:
- Multi-lingual Query-product Relation Selection
- Multi-lingual Query-product Ranking
- Multi-lingual Session-based Product Recommendation
"""
|
|
|
|