"""Gradio app that runs user-supplied SQL on the HuggingFaceFW/fineweb dataset via DuckDB."""

import duckdb
import gradio as gr

# Character budget for one displayed row; it is split evenly across the columns.
MAX_ROW_CHARS = 150

# Single in-memory database shared by the whole app. Requests never use this
# connection directly; each call to `greet` works through its own cursor.
con = duckdb.connect(":memory:")


def _truncate(value, width):
    """Return ``value`` cut to ``width`` characters plus "..." when it is a longer string.

    Non-string values and strings of at most ``width`` characters are returned unchanged.
    """
    if isinstance(value, str) and len(value) > width:
        return value[:width] + "..."
    return value


def greet(SQL_Query):
    """Run ``SQL_Query`` with DuckDB and return the result as a display-friendly DataFrame.

    Args:
        SQL_Query: SQL text entered by the user. It must contain the word
            "limit" (case-insensitive) as a cheap guard against unbounded scans.

    Returns:
        A pandas DataFrame with the query result. String cells are shortened so
        that one row shows roughly ``MAX_ROW_CHARS`` characters in total.

    Raises:
        gr.Error: If the query has no LIMIT clause, if DuckDB rejects or fails
            to execute it, or if the statement produces no result set.
    """
    # NOTE(review): the query text is executed verbatim, so anyone who can reach
    # the UI can run arbitrary DuckDB SQL. Confirm that this is intended for the
    # deployment environment.
    if "limit" not in SQL_Query.lower():
        raise gr.Error("You should use the LIMIT clause or it may take too much time to run your query. For example: ```LIMIT 10000```")

    # DuckDB connections are not safe for concurrent use from several threads;
    # a per-call cursor gives this request its own handle on the same database.
    cursor = con.cursor()
    try:
        relation = cursor.sql(SQL_Query)
        if relation is None:
            # Statements without a result set (DDL, SET, ...) yield no relation.
            raise gr.Error("The query did not return a result set. Please run a SELECT query.")
        df = relation.df()
    except duckdb.Error as err:
        # Surface the DuckDB message to the user instead of a bare traceback.
        raise gr.Error(str(err)) from err
    finally:
        cursor.close()

    n_columns = len(df.columns)
    if n_columns > 0:
        # Truncate long strings. The width is clamped to at least one character
        # so that very wide results do not collapse every cell to just "...".
        width = max(1, MAX_ROW_CHARS // n_columns)
        df = df.apply(lambda column: column.apply(lambda cell: _truncate(cell, width)))
    return df


examples = [
    "SELECT * FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' LIMIT 10;",
    "SELECT text, language_score FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' WHERE language_score > 0.97 LIMIT 10;",
    "SELECT text, language_score FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' WHERE language_score < 0.67 LIMIT 10;",
    (
        "SELECT dump, min(language_score), avg(language_score), max(language_score) FROM\n"
        "(SELECT * FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' LIMIT 10000)\n"
        "GROUP BY dump;"
    ),
    "SELECT text, language_score FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' WHERE text SIMILAR TO '([A-Z ]){4,}.*' LIMIT 10;",
    (
        "SELECT dump, min(token_count), avg(token_count), max(token_count) FROM\n"
        "(SELECT * FROM 'hf://datasets/HuggingFaceFW/fineweb/sample/10BT/*.parquet' LIMIT 10000)\n"
        "GROUP BY dump;"
    ),
]

css = "#component-4{display: block;}"
description = "Run SQL queries on the HuggingFaceFW/fineweb dataset"

demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="dataframe",
    examples=examples,
    cache_examples=False,
    description=description,
    css=css,
)
demo.launch()