Add edit leaderboard + citation #3661

Open · wants to merge 10 commits into main
13 changes: 13 additions & 0 deletions fastchat/serve/gradio_web_server.py

@@ -66,6 +66,19 @@
 enable_moderation = False
 use_remote_storage = False
 
+default_citation_md = """
+### Citation
+Please cite the following paper if you find our leaderboard or dataset helpful.
+```
+@misc{chiang2024chatbot,
+    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
+    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
+    year={2024},
+    eprint={2403.04132},
+    archivePrefix={arXiv},
+    primaryClass={cs.AI}
+}
+```
+"""
 acknowledgment_md = """
 ### Terms of Service
 
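Hoisting the citation into the module-level `default_citation_md` constant lets other modules import and re-render it. A minimal sketch of consuming the constant in isolation (the `gr.Blocks` scaffolding is illustrative, not part of the diff; it mirrors how monitor.py uses the constant later in this PR):

```python
# Minimal sketch: render the new module-level constant on its own.
# Assumes a FastChat install; the layout is illustrative.
import gradio as gr

from fastchat.serve.gradio_web_server import default_citation_md

with gr.Blocks() as demo:
    with gr.Accordion("Citation", open=True):
        gr.Markdown(default_citation_md, elem_id="leaderboard_markdown")

if __name__ == "__main__":
    demo.launch()
```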
117 changes: 90 additions & 27 deletions fastchat/serve/monitor/copilot_arena.py

@@ -6,6 +6,16 @@
 from fastchat.serve.monitor.monitor import recompute_final_ranking
 
 copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL")
+copilot_arena_citation_md = """
+### Citation
+Please cite the following paper if you find our leaderboard helpful.
+```
+@misc{chi2025copilot,
+    title={Copilot Arena: A Platform for Code LLM Evaluation in the Wild},
+    author={Wayne Chi and Valerie Chen and Wei-Lin Chiang and Anastasios N. Angelopoulos and Aditya Mittal and Naman Jain and Tianjun Zhang and Ion Stoica and Chris Donahue and Ameet Talwalkar},
+    year={2025},
+}
+```
+"""
 
 
 def process_copilot_arena_leaderboard(leaderboard):
@@ -31,41 +41,88 @@ def process_copilot_arena_leaderboard(leaderboard):
         by=["Rank* (UB)", "score"], ascending=[True, False]
     )
 
+    leaderboard = leaderboard.rename(
+        columns={
+            "name": "Model",
+            "confidence_interval": "Confidence Interval",
+            "score": "Arena Score",
+            "organization": "Organization",
+            "votes": "Votes",
+        }
+    )
+
+    column_order = [
+        "Rank* (UB)",
+        "Model",
+        "Arena Score",
+        "Confidence Interval",
+        "Votes",
+        "Organization",
+    ]
+    leaderboard = leaderboard[column_order]
+
     return leaderboard
 
 
+def make_copilot_arena_leaderboard_md(leaderboard, interaction_mode):
+    num_models = len(leaderboard)
+    total_battles = int(leaderboard["Votes"].sum()) // 2
+    space = "&nbsp;&nbsp;&nbsp;"
+    leaderboard_md = f"""### {interaction_mode}
+#### {space} #models: **{num_models}** {space} #votes: **{"{:,}".format(total_battles)}** {space}
+"""
+    return leaderboard_md
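For reference, a standalone sketch of the heading this helper emits. The frame below is fabricated, and the `// 2` presumably halves the vote total because each battle records one vote for each of the two models involved:

```python
import pandas as pd

# Fabricated frame; the helper only uses len() and the "Votes" column.
leaderboard = pd.DataFrame({"Model": ["model-a", "model-b"], "Votes": [6000, 4000]})

num_models = len(leaderboard)                         # 2
total_battles = int(leaderboard["Votes"].sum()) // 2  # 5000 battles
space = "&nbsp;&nbsp;&nbsp;"
print(f"""### Code Completion
#### {space} #models: **{num_models}** {space} #votes: **{"{:,}".format(total_battles)}** {space}""")
```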


 def build_copilot_arena_tab():
     response = requests.get(copilot_arena_leaderboard_url)
     if response.status_code == 200:
-        leaderboard = pd.DataFrame(response.json()["elo_data"])
-        leaderboard = process_copilot_arena_leaderboard(leaderboard)
-        leaderboard = leaderboard.rename(
-            columns={
-                "name": "Model",
-                "confidence_interval": "Confidence Interval",
-                "score": "Arena Score",
-                "organization": "Organization",
-                "votes": "Votes",
-            }
-        )
+        response_json = response.json()
+
+        def update_copilot_arena_leaderboard(interaction_mode):
+            if interaction_mode == "Code Completion":
+                leaderboard = pd.DataFrame(response_json["elo_data"])
+            else:
+                leaderboard = pd.DataFrame(response_json["edit_elo_data"])
+            leaderboard = process_copilot_arena_leaderboard(leaderboard)
+            leaderboard_df = gr.DataFrame(
+                leaderboard,
+                datatype=["str" for _ in leaderboard.columns],
+                elem_id="arena_hard_leaderboard",
+                height=600,
+                wrap=True,
+                interactive=False,
+                column_widths=[70, 130, 60, 80, 50, 80],
+            )
+
+            md = make_copilot_arena_leaderboard_md(leaderboard, interaction_mode)
+            leaderboard_md = gr.Markdown(md, elem_id="leaderboard_markdown")
+
+            return leaderboard_df, leaderboard_md
+
+        gr.Markdown(
+            "[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs.",
+            elem_id="copilot_arena_introduction",
+        )
+
-        column_order = [
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "Confidence Interval",
-            "Votes",
-            "Organization",
-        ]
-        leaderboard = leaderboard[column_order]
-        num_models = len(leaderboard)
-        total_battles = int(leaderboard["Votes"].sum()) // 2
-        md = f"""
-[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
-"""
-
-        gr.Markdown(md, elem_id="leaderboard_markdown")
-        gr.DataFrame(
+        leaderboard = pd.DataFrame(response_json["elo_data"])
+        leaderboard = process_copilot_arena_leaderboard(leaderboard)
+        with gr.Row():
+            with gr.Column(scale=2):
+                interaction_mode_dropdown = gr.Radio(
+                    choices=["Code Completion", "Code Edit"],
+                    label="Interaction Mode",
+                    value="Code Completion",
+                )
+                vote_data = make_copilot_arena_leaderboard_md(
+                    leaderboard, "Code Completion"
+                )
+            with gr.Column(scale=3, variant="panel"):
+                interaction_mode_details = gr.Markdown(
+                    vote_data, elem_id="interaction_mode_details"
+                )
+
+        leaderboard_df = gr.DataFrame(
             leaderboard,
             datatype=["str" for _ in leaderboard.columns],
             elem_id="arena_hard_leaderboard",

@@ -83,5 +140,11 @@ def build_copilot_arena_tab():
 """,
             elem_id="leaderboard_markdown",
         )
+
+        interaction_mode_dropdown.change(
+            update_copilot_arena_leaderboard,
+            inputs=[interaction_mode_dropdown],
+            outputs=[leaderboard_df, interaction_mode_details],
+        )
     else:
         gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
37 changes: 21 additions & 16 deletions fastchat/serve/monitor/monitor.py

@@ -1038,10 +1038,13 @@ def build_leaderboard_tab(
     from fastchat.serve.monitor.copilot_arena import (
         build_copilot_arena_tab,
         copilot_arena_leaderboard_url,
+        copilot_arena_citation_md,
     )
 
     if copilot_arena_leaderboard_url:
-        with gr.Tab("Copilot Arena Leaderboard", id=5):
+        with gr.Tab(
+            "Copilot Arena Leaderboard", id=5
+        ) as copilot_arena_leaderboard_tab:
             build_copilot_arena_tab()
     else:
         print(
@@ -1060,27 +1063,29 @@ def build_leaderboard_tab(
     else:
         pass
 
-    from fastchat.serve.gradio_web_server import acknowledgment_md
+    from fastchat.serve.gradio_web_server import default_citation_md, acknowledgment_md
 
     with gr.Accordion(
         "Citation",
         open=True,
     ):
-        citation_md = """
-### Citation
-Please cite the following paper if you find our leaderboard or dataset helpful.
-```
-@misc{chiang2024chatbot,
-    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
-    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
-    year={2024},
-    eprint={2403.04132},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI}
-}
-"""
-        gr.Markdown(citation_md, elem_id="leaderboard_markdown")
+        leaderboard_citation_md = gr.Markdown(
+            default_citation_md, elem_id="leaderboard_markdown"
+        )
     gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
+    if copilot_arena_leaderboard_tab:
+        copilot_arena_leaderboard_tab.select(
+            fn=lambda: gr.Markdown(copilot_arena_citation_md),
+            inputs=[],
+            outputs=[leaderboard_citation_md],
+        )
+    for tab in tabs.children:
+        if (not copilot_arena_leaderboard_tab) or tab != copilot_arena_leaderboard_tab:
+            tab.select(
+                fn=lambda: gr.Markdown(default_citation_md),
+                inputs=[],
+                outputs=[leaderboard_citation_md],
+            )
 
     return [md_1] + gr_plots
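The per-tab citation swap likewise reduces to Gradio's `Tab.select` event: selecting the Copilot Arena tab installs its citation, and selecting any other tab restores the default. A minimal sketch of just that wiring (tab contents are placeholders):

```python
import gradio as gr

default_citation_md = "### Citation\nDefault leaderboard citation goes here."
copilot_arena_citation_md = "### Citation\nCopilot Arena citation goes here."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Arena Leaderboard") as arena_tab:
            gr.Markdown("placeholder leaderboard")
        with gr.Tab("Copilot Arena Leaderboard") as copilot_tab:
            gr.Markdown("placeholder leaderboard")

    leaderboard_citation_md = gr.Markdown(default_citation_md)

    # Each tab's select event rewrites the shared citation Markdown.
    copilot_tab.select(
        fn=lambda: gr.Markdown(copilot_arena_citation_md),
        inputs=[],
        outputs=[leaderboard_citation_md],
    )
    arena_tab.select(
        fn=lambda: gr.Markdown(default_citation_md),
        inputs=[],
        outputs=[leaderboard_citation_md],
    )

if __name__ == "__main__":
    demo.launch()
```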
