Weyaxi's picture
applymap to map in pandas function
c38f61a verified
raw
history blame contribute delete
No virus
3.02 kB
import os
import gradio as gr
import pandas as pd
import time
import threading
from huggingface_hub import HfApi
from humanize import naturalsize
# Access token read from the environment; ``None`` when the variable is unset.
# NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Hugging Face Hub client built with default arguments.
# NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
api = HfApi()
def clickable(x):
    """Return an HTML anchor that links to the Hugging Face page for ``x``.

    ``x`` is interpolated both into the URL path and as the visible link
    text; the link opens in a new tab and is rendered with a dotted
    underline in the theme's link colour.
    """
    link_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    profile_url = f"https://huggingface.co/{x}"
    return f'<a target="_blank" href="{profile_url}" style="{link_style}">{x}</a>'
def apply_headers(df, headers):
    """Return a new frame with the data of ``df`` and ``headers`` as column labels.

    The input frame is left untouched. ``headers`` must supply exactly one
    label per existing column; pandas raises ``ValueError`` otherwise.
    """
    # set_axis hands back a relabelled copy instead of mutating ``df``.
    return df.set_axis(headers, axis="columns")
def search(search_text):
    """Filter the leaderboard to rows whose author name contains ``search_text``.

    Matching is case-insensitive and literal (the query is not interpreted
    as a regular expression).

    Args:
        search_text: Text typed into the search box. An empty or ``None``
            value returns the full leaderboard.

    Returns:
        The module-level ``df`` itself when there is no query, otherwise the
        subset of its rows whose author name matches.
    """
    if not search_text:
        return df

    # The author column stores HTML anchors; drop the tags so the query is
    # compared with the visible name only and markup such as "href" or
    # "style" can never produce a match.
    author_names = df['👤 Author'].str.replace(r'<[^>]+>', '', regex=True)

    # regex=False: user input like "(" or "[" must not be compiled as a
    # pattern, which would raise re.error inside the UI callback.
    matches = author_names.str.contains(
        search_text, case=False, regex=False, na=False
    )
    return df[matches]
# --- Leaderboard table preparation -----------------------------------------
# Load the per-author figures and keep an untouched snapshot before any
# formatting is applied.
df = pd.read_csv("author_data_hf_merged.csv")
df_author_copy = df.copy()

# Turn every author name into a clickable profile link.
df["author"] = df["author"].apply(clickable)

# Rank authors by their combined usage across the three categories.
df['Total Usage'] = df[['models', 'datasets', 'spaces']].sum(axis=1)
df = df.sort_values(by='Total Usage', ascending=False)

# Grand total over every author and category, formatted for display. This
# must be computed while the columns still hold raw numbers.
sum_all_author = naturalsize(
    sum(
        value
        for column in ('models', 'datasets', 'spaces')
        for value in df[column].tolist()
    )
)

# Replace the raw numbers with human-readable sizes.
naturalsize_columns = ['Total Usage', 'models', 'datasets', 'spaces']
df[naturalsize_columns] = df[naturalsize_columns].map(naturalsize)

# 1-based rank following the sort order established above.
df['Serial Number'] = list(range(1, len(df) + 1))

# Fix the column order, then swap in the display headers.
df = df[['Serial Number', "author", "Total Usage", "models", "datasets", "spaces"]]
df = apply_headers(df, ["🔢 Serial Number", "👤 Author", "⚡️ Total Usage", "🏛️ Models", "📊 Datasets", "🚀 Spaces"])
# Markdown description shown on the page. This is an f-string: it
# interpolates ``sum_all_author`` (the formatted grand total), so that name
# must already be defined when this statement runs.
desc = f"""
🎯 The Leaderboard aims to track authors data usage in 🤗 Huggingface.
## 📄 Information
🛠️ This leaderboard consists of 125k authors scraped from [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard).
These 125k authors have been selected based on their [🤗 Huggingface Leaderboard](https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard) positions:
- 🤖 Top 60k authors in the models category
- 📊 Top 60k authors in the datasets category
- 🚀 Top 50k authors in the spaces category
## 📒 Notes
Note that these numbers may not be entirely accurate due to the following reasons:
- I only calculated the data usage from the main branch and did not include deleted files that cannot be directly seen.
- There may be large datasets/models to which I don't have access (either private or gated).
# 📶 Total Data Usage From All Authors
According to this leaderboard, there is a total of {sum_all_author} of data on this platform.
"""
# Write note maybe?
# Centered HTML heading for the page.
# NOTE(review): this name is not referenced in the visible part of the file;
# the Blocks layout passes its own inline heading literal to gr.Markdown.
# Confirm whether it should be used there or removed.
title = """
<div style="text-align:center">
<h1 id="space-title">💾 Data Leaderboard 💾</h1>
</div>
"""
# Assemble the page: heading, description, a search box, and the leaderboard
# table. Submitting the search box re-renders the table with the rows
# returned by ``search``.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">💾 Data Leaderboard 💾</h1>""")
    gr.Markdown(desc)

    with gr.Column(min_width=320):
        search_bar = gr.Textbox(placeholder="🔍 Search for a author", show_label=False)

    # One datatype per column of the six-column frame: the serial number is
    # numeric, the author cell is an HTML link (rendered via "markdown"), and
    # the four size columns hold pre-formatted text, not raw numbers.
    gr_followers = gr.Dataframe(
        df,
        interactive=False,
        datatype=["number", "markdown", "str", "str", "str", "str"],
    )

    search_bar.submit(fn=search, inputs=search_bar, outputs=gr_followers)

# Start the web server; in a plain script run this call blocks until shutdown.
demo.launch()