Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import pickle | |
| from prefixspan import PrefixSpan | |
| import gradio as gr | |
| import altair as alt | |
| from plot import Plot | |
| alt.data_transformers.enable("vegafusion") | |
| # from dynabench.task_evaluator import * | |
# Root folder; data files are read from f"{BASE_DIR}/<folder>/<partition>/...".
BASE_DIR = "db"

# Model names offered for non-video domains.
MODELS = [
    'qwenvl-chat',
    'qwenvl',
    'llava15-7b',
    'llava15-13b',
    'instructblip-vicuna13b',
    'instructblip-vicuna7b',
]

# Model names offered when the domain folder name contains "video".
VIDEO_MODELS = [
    'video-chat2-7b',
    'video-llama2-7b',
    'video-llama2-13b',
    'chat-univi-7b',
    'chat-univi-13b',
    'video-llava-7b',
    'video-chatgpt-7b',
]

domains = [
    "imageqa-2d-sticker",
    "imageqa-3d-tabletop",
    "imageqa-scene-graph",
    "videoqa-3d-tabletop",
    "videoqa-scene-graph",
]

# Domain name -> sub-folder of BASE_DIR; a missing selection (None) uses '2d'.
domain2folder = {
    "imageqa-2d-sticker": "2d",
    "imageqa-3d-tabletop": "3d",
    "imageqa-scene-graph": "sg",
    "videoqa-3d-tabletop": "video-3d",
    "videoqa-scene-graph": "video-sg",
    None: '2d',
}
def find_frequent_patterns(k, df, scores=None):
    """Mine the top-k closed frequent patterns from the rows of ``df``.

    Every row becomes a sequence of ``(column, value)`` items; values are
    stringified and empty ones (including former NaNs) are dropped. The
    sequences are handed to ``PrefixSpan``.

    Args:
        k: Number of patterns requested from ``PrefixSpan.topk``.
        df: DataFrame whose rows are mined.
        scores: Optional per-row scores, addressed by row position.

    Returns:
        ``patterns`` when ``scores`` is None, otherwise the tuple
        ``(patterns, aggregated_scores)`` where each aggregated score is the
        mean score of the rows matching every item of the pattern. For an
        empty ``df`` the result is ``[]`` or ``([], [])`` respectively, so the
        return shape always matches the ``scores`` argument.
    """
    if len(df) == 0:
        # Keep the tuple shape when scores were requested so callers can
        # always unpack the result.
        return [] if scores is None else ([], [])

    df = df.reset_index(drop=True)
    columns = df.columns.to_list()
    df = df.fillna('').astype('str')
    db = [
        [(col, val) for col, val in zip(columns, row) if val]
        for row in df.values.tolist()
    ]
    patterns = PrefixSpan(db).topk(k, closed=True)
    if scores is None:
        return patterns

    scores = np.asarray(scores)
    aggregated_scores = []
    for _count, pattern in patterns:
        # Back-ticks allow column names that are not valid identifiers.
        query = ' and '.join(f"`{col}` == {val!r}" for col, val in pattern)
        indices = df.query(query).index.to_numpy()
        aggregated_scores.append(np.mean(scores[indices]))
    return patterns, aggregated_scores
def update_partition_and_models(domain):
    """Build the partition and model dropdowns for the selected domain.

    Args:
        domain: Key of ``domain2folder``.

    Returns:
        ``[partition_dropdown, model_dropdown]``. Both are empty when the
        domain folder does not exist.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    if not os.path.exists(path):
        return [
            gr.Dropdown([], value=None, label="task space of the following task generator"),
            gr.Dropdown([], value=None, label="model"),
        ]

    partitions = list_directories(path)
    model_choices = VIDEO_MODELS if "video" in folder else MODELS
    model = gr.Dropdown(model_choices, value=model_choices[0], label="model")
    # A domain folder without sub-directories must not crash on partitions[0].
    partition = gr.Dropdown(
        partitions,
        value=partitions[0] if partitions else None,
        label="task space of the following task generator",
    )
    return [partition, model]
def update_partition_and_models_and_baselines(domain):
    """Build the partition, model and baseline dropdowns for a domain.

    Args:
        domain: Key of ``domain2folder``.

    Returns:
        ``[partition_dropdown, model_dropdown, baseline_dropdown]``. All are
        empty when the domain folder does not exist.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    if os.path.exists(path):
        partitions = list_directories(path)
        model_choices = VIDEO_MODELS if "video" in folder else MODELS
        model = gr.Dropdown(model_choices, value=model_choices[0], label="model")
        baseline = gr.Dropdown(model_choices, value=model_choices[0], label="baseline")
        # A domain folder without sub-directories must not crash on partitions[0].
        partition = gr.Dropdown(
            partitions,
            value=partitions[0] if partitions else None,
            label="task space of the following task generator",
        )
    else:
        partition = gr.Dropdown([], value=None, label="task space of the following task generator")
        model = gr.Dropdown([], value=None, label="model")
        baseline = gr.Dropdown([], value=None, label="baseline")
    return [partition, model, baseline]
def get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline):
    """Return the row index labels of the tasks selected by the current query.

    With a ``baseline``, a task is kept when every model in ``models`` scores
    at least as high as the baseline. Without one, tasks are filtered by
    ``threshold`` (``>=`` for rank "top", ``<=`` otherwise), sorted by the mean
    score over ``models`` and cut to the ``k`` best or worst rows.

    Args:
        domain: Key of ``domain2folder``; an already-mapped folder name is
            accepted too.
        partition: Sub-folder of the domain folder.
        models: Column names of the models to filter on.
        rank: "top" selects the best tasks, anything else the worst.
        k: Number of tasks to keep (ignored when ``baseline`` is set).
        threshold: Score threshold (ignored when ``baseline`` is set).
        baseline: Column name of the baseline model, or a falsy value.

    Returns:
        List of index labels of ``merged_data.csv``; empty when the file does
        not exist.
    """
    # Some callers pass the folder name instead of the domain key; accept both
    # rather than failing with a KeyError on the second lookup.
    folder = domain2folder.get(domain, domain)
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(data_path):
        return []

    df = pd.read_csv(data_path)
    df = df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})
    select_top = rank == "top"
    for model in models:
        if baseline:
            df = df[df[model] >= df[baseline]]
        elif select_top:
            df = df[df[model] >= threshold]
        else:
            df = df[df[model] <= threshold]

    if not baseline:
        k = int(k)  # slider values may arrive as floats
        # assign() builds a new frame instead of writing into a filtered slice.
        df = df.assign(**{'mean score': df[models].mean(axis=1)})
        df = df.sort_values(by='mean score', ascending=False)
        # head/tail behave like iloc[:k] / iloc[-k:] but tail(0) is empty.
        df = df.head(k) if select_top else df.tail(k)
    return list(df.index)
def plot_patterns(domain, partition, models, rank, k, threshold, baseline, pattern, order):
    """Bar chart of mean model scores on the selected tasks matching a pattern.

    Args:
        domain: Key of ``domain2folder``.
        partition: Sub-folder of the domain folder.
        models: Model names to show.
        rank, k, threshold, baseline: Forwarded to ``get_filtered_task_ids``.
        pattern: String form of ``(frequency, [(column, value), ...])``.
        order: Sort order of the model axis.

    Returns:
        An Altair chart, or None when the data file is missing or no pattern
        is selected.
    """
    import ast

    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path) or not pattern:
        return None

    # Pass the original domain key: get_filtered_task_ids does its own
    # domain2folder lookup, so handing it the folder name raised a KeyError.
    task_ids = get_filtered_task_ids(domain, partition, models, rank, k, threshold, baseline)
    expand_df = pd.read_csv(data_path)
    shown_models = (models + [baseline]) if baseline else models
    df = expand_df[expand_df['model'].isin(shown_models)]
    df = df[df['task id'].isin(task_ids)]

    # The pattern string comes from a UI widget; literal_eval parses the
    # tuple/list/str/int literal without executing arbitrary code (was eval).
    freq, cols = ast.literal_eval(pattern)
    conditions = []
    for col_name, col_val in cols:
        try:
            col_val = int(col_val)
        except (TypeError, ValueError):
            pass  # non-numeric values are compared as-is
        df = df[df[col_name] == col_val]
        conditions.append(f"{col_name} = {col_val}")
    pattern_str = ", ".join(conditions)

    if baseline:
        model_str = ', '.join(models)
        verb = 'perform' if len(models) > 1 else 'performs'
        title = f"{model_str} {verb} better than {baseline} on {freq} tasks where {pattern_str}"
    else:
        title = f"Models are {'best' if rank == 'top' else 'worst'} at {freq} tasks where {pattern_str}"

    chart = alt.Chart(df).mark_bar().encode(
        alt.X('model:N',
              sort=alt.EncodingSortField(field='score', order=order, op="mean"),
              axis=alt.Axis(labels=False, tickSize=0)),
        alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
        alt.Color('model:N').legend(),
    ).properties(
        width=400,
        height=300,
        title=title
    )
    return chart
def plot_embedding(domain, partition, category):
    """Scatter plot of the ``x``/``y`` columns of merged_data.csv.

    Points are coloured by ``category``; the tooltip shows the question and
    answer, plus the image column when the file has one.

    Returns:
        An interactive Altair chart, or None when the data file is missing.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(data_path):
        return None

    merged_df = pd.read_csv(data_path)
    tooltip_fields = ['question', 'answer']
    if 'image' in merged_df:
        tooltip_fields = tooltip_fields + ['image']

    points = alt.Chart(merged_df).mark_point(size=30, filled=True)
    chart = points.encode(
        alt.OpacityValue(0.5),
        alt.X('x:Q', title="UMAP Component 1"),
        alt.Y('y:Q', title="UMAP Component 2"),
        alt.Color(f'{category}:N'),
        tooltip=tooltip_fields,
    )
    chart = chart.properties(width=800, height=800, title="UMAP Projected Task Embeddings")
    chart = chart.configure_axis(labelFontSize=25, titleFontSize=25)
    chart = chart.configure_title(fontSize=40)
    chart = chart.configure_legend(labelFontSize=25, titleFontSize=25)
    return chart.interactive()
def plot_multi_models(domain, partition, category, cat_options, models, order, pattern, aggregate="mean"):
    """Bar chart of mean model scores, by pattern or by task category.

    When ``pattern`` is given, rows of expanded_data.csv matching every
    ``(column, value)`` item of the pattern are charted for all models.
    Otherwise rows are restricted to ``models`` and ``cat_options`` and
    charted per ``category``.

    Args:
        domain: Key of ``domain2folder``.
        partition: Sub-folder of the domain folder.
        category: Column used for grouping in the non-pattern mode.
        cat_options: Values of ``category`` to keep.
        models: Model names to show.
        order: Sort order of the x axis in the non-pattern mode.
        pattern: String form of ``(frequency, [(column, value), ...])`` or None.
        aggregate: Unused; kept for interface compatibility.

    Returns:
        An Altair chart, or None when the data file is missing.
    """
    import ast

    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path):
        return None
    expand_df = pd.read_csv(data_path)

    if pattern is not None:
        # The pattern string comes from a UI widget; literal_eval parses the
        # literal without executing arbitrary code (was eval).
        freq, cols = ast.literal_eval(pattern)
        df = expand_df
        conditions = []
        for col_name, col_val in cols:
            try:
                col_val = int(col_val)
            except (TypeError, ValueError):
                pass  # non-numeric values are compared as-is
            df = df[df[col_name] == col_val]
            conditions.append(f"{col_name} = {col_val}")
        pattern_str = ", ".join(conditions)
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order='ascending', op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
        ).properties(
            width=200,
            height=100,
            title=f"How do models perform on tasks where {pattern_str} (N={freq})?"
        )
        return chart

    df = expand_df[(expand_df['model'].isin(models)) & (expand_df[category].isin(cat_options))]
    if len(models) > 1:
        # Several models: one facet per category value, one bar per model.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order=order, op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
            alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
        ).properties(
            width=200,
            height=100,
            title=f"How do models perform across {category}?"
        )
    else:
        # Single model: one bar per category value.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{category}:N', sort=alt.EncodingSortField(field='score', order=order, op="mean")),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color(f'{category}:N').legend(None),
        ).properties(
            width=200,
            height=100,
            title=f"How does {models[0]} perform across {category}?"
        )
    chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
    return chart
def plot(domain, partition, models, rank, k, threshold, baseline, order, category, cat_options):
    """Bar chart of mean scores per ``category`` on the selected tasks.

    Tasks are selected from merged_data.csv: with a ``baseline``, those where
    every model scores at least as high as the baseline; otherwise those
    passing ``threshold``, cut to the ``k`` best/worst by mean score. The
    chart is drawn from the matching rows of expanded_data.csv.

    Args:
        domain: Key of ``domain2folder``.
        partition: Sub-folder of the domain folder.
        models: Model names to analyse (not modified).
        rank: "top" for best tasks, anything else for worst.
        k: Number of tasks to keep (ignored when ``baseline`` is set).
        threshold: Score threshold (ignored when ``baseline`` is set).
        baseline: Baseline model name, or a falsy value.
        order: Sort order of the x axis.
        category: Column used for grouping.
        cat_options: Values of ``category`` to keep; falsy keeps all.

    Returns:
        An Altair chart, or None when either data file is missing.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    expand_data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(expand_data_path):
        return None

    df = pd.read_csv(data_path)
    df = df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'})
    expand_df = pd.read_csv(expand_data_path)
    select_top = rank == "top"

    for model in models:
        if baseline:
            df = df[df[model] >= df[baseline]]
        elif select_top:
            df = df[df[model] >= threshold]
        else:
            df = df[df[model] <= threshold]
    if not baseline:
        k = int(k)  # slider values may arrive as floats
        # assign() builds a new frame instead of writing into a filtered slice.
        df = df.assign(**{'mean score': df[models].mean(axis=1)})
        df = df.sort_values(by='mean score', ascending=False)
        # head/tail behave like iloc[:k] / iloc[-k:] but tail(0) is empty.
        df = df.head(k) if select_top else df.tail(k)
    task_ids = list(df.index)

    # Build a new list: the caller's list must not be mutated, and the title
    # below must name only the compared models, not the baseline itself.
    shown_models = models + [baseline] if baseline else list(models)
    chart_df = expand_df[expand_df['model'].isin(shown_models)]
    chart_df = chart_df[chart_df['task id'].isin(task_ids)]
    df = chart_df[chart_df[category].isin(cat_options)] if cat_options else chart_df

    if baseline:
        model_str = ', '.join(models)
        verb = 'perform' if len(models) > 1 else 'performs'
        title = f"Are there any tasks where {model_str} {verb} better than {baseline} (by {category})?"
    else:
        title = f"What tasks are models {'best' if select_top else 'worst'} at by {category}?"

    if len(shown_models) > 1:
        # Several models: one facet per category value, one bar per model.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order=order, op="mean"),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color('model:N').legend(),
            alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
        ).properties(
            width=200,
            height=100,
            title=title
        )
    else:
        # Single model: one bar per category value.
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{category}:N', sort=alt.EncodingSortField(field='score', order=order, op="mean")),
            alt.Y('mean(score):Q', scale=alt.Scale(zero=True)),
            alt.Color(f'{category}:N').legend(None),
        ).properties(
            width=200,
            height=100,
            title=f"What tasks is model {shown_models[0]} {'best' if select_top else 'worst'} at by {category}?"
        )
    chart = chart.configure_title(fontSize=15, offset=5, orient='top', anchor='middle')
    return chart
def get_frequent_patterns(task_plan, scores):
    """Return the top-10 frequent patterns of ``task_plan`` with their scores.

    Thin wrapper around ``find_frequent_patterns(k=10, ...)``; the result was
    previously computed and then discarded.
    """
    return find_frequent_patterns(k=10, df=task_plan, scores=scores)
def list_directories(path):
    """Return the names of the entries directly under ``path`` that are directories."""
    subdirs = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirs.append(entry)
    return subdirs
def update_category(domain, partition):
    """Build the "task metadata" dropdown from the task plan's columns.

    The choices are the columns of ``task_plan.pkl`` plus "task id". An empty
    dropdown is returned when the file does not exist.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, label="task metadata")

    # NOTE(review): pickle.load can execute code; assumes this file is trusted
    # application data. The context manager closes the handle promptly.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = list(data.columns)
    return gr.Dropdown(categories + ["task id"], value=None, label="task metadata", interactive=True)
def update_category2(domain, partition, existing_category):
    """Build the optional second "task metadata" dropdown.

    The choices are the columns of ``task_plan.pkl`` without
    ``existing_category``. An empty dropdown is returned when the file does
    not exist.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, label="task metadata")

    # NOTE(review): pickle.load can execute code; assumes this file is trusted
    # application data. The context manager closes the handle promptly.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = list(data.columns)
    if existing_category and existing_category in categories:
        categories.remove(existing_category)
    return gr.Dropdown(categories, value=None, label="Optional: second task metadata", interactive=True)
def update_partition(domain):
    """Build the partition dropdown listing the sub-folders of a domain.

    The first sub-folder is preselected; the dropdown is empty when the
    domain folder is missing or has no sub-folders.
    """
    folder = domain2folder[domain]
    path = f"{BASE_DIR}/{folder}"
    partitions = list_directories(path) if os.path.exists(path) else []
    # Guard against an existing but empty folder (partitions[0] would raise).
    selected = partitions[0] if partitions else None
    return gr.Dropdown(partitions, value=selected, label="task space of the following task generator")
def update_k(domain, partition, category=None):
    """Build the "k" slider sized to the data of a partition.

    The maximum is the number of distinct values of ``category`` in
    merged_data.csv, or the number of rows when ``category`` is empty or
    "task id". The default is half the maximum.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not os.path.exists(data_path):
        return gr.Slider(1, 1, 1, step=1.0, label="k")

    data = pd.read_csv(data_path)
    if category and category != "task id":
        max_k = len(data[category].unique())
    else:
        max_k = len(data)
    # Keep maximum and default inside the slider range, which starts at 1:
    # an empty file gives 0 rows and 1 // 2 is 0.
    max_k = max(1, max_k)
    mid = max(1, max_k // 2)
    return gr.Slider(1, max_k, mid, step=1.0, label="k")
| # def update_category_values(domain, partition, category): | |
| # data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" | |
| # if os.path.exists(data_path) and category is not None: | |
| # data = pd.read_csv(data_path) | |
| # uni_cats = list(data[category].unique()) | |
| # return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") | |
| # else: | |
| # return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") | |
| # def update_category_values(domain, partition, models, rank, k, threshold, baseline, category): | |
| # data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv" | |
| # if not os.path.exists(data_path): | |
| # return gr.Dropdown([], multiselect=True, value=None, interactive=False, label="category values") | |
| # else: | |
| # merged_df = pd.read_csv(data_path) | |
| # merged_df.rename(columns={'llavav1.5-7b': 'llava15-7b', 'llavav1.5-13b': 'llava15-13b'}, inplace=True) | |
| # df = merged_df | |
| # select_top = rank == "top" | |
| # # Model X is good / bad at | |
| # for model in models: | |
| # if baseline: | |
| # df = df[df[model] >= df[baseline]] | |
| # else: | |
| # if select_top: | |
| # df = df[df[model] >= threshold] | |
| # else: | |
| # df = df[df[model] <= threshold] | |
| # if not baseline: | |
| # df['mean score'] = df[models].mean(axis=1) | |
| # df = df.sort_values(by='mean score', ascending=False) | |
| # df = df.iloc[:k, :] if select_top else df.iloc[-k:, :] | |
| # uni_cats = list(df[category].unique()) | |
| # return gr.Dropdown(uni_cats, multiselect=True, value=None, interactive=True, label="category values") | |
def update_tasks(domain, partition, find_pattern):
    """Toggle the widgets between pattern mode and metadata mode.

    Args:
        domain: Key of ``domain2folder``.
        partition: Sub-folder of the domain folder.
        find_pattern: "yes" enables the k slider and pattern dropdown and
            disables the metadata dropdown; any other value does the reverse.

    Returns:
        ``[k_slider, pattern_dropdown, metadata_dropdown]``.
    """
    folder = domain2folder[domain]
    pattern_mode = find_pattern == "yes"
    k1 = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=pattern_mode)
    pattern = gr.Dropdown([], value=None, interactive=pattern_mode, label="pattern")
    if pattern_mode:
        category1 = gr.Dropdown([], value=None, interactive=False, label="task metadata")
        return [k1, pattern, category1]

    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if os.path.exists(data_path):
        # Only the header is needed to list the columns.
        columns = pd.read_csv(data_path, nrows=0).columns
        # Model score columns (image and video models) and the question/answer
        # text are not task metadata.
        non_columns = set(MODELS) | set(VIDEO_MODELS) | {'question', 'answer'}
        categories = [col for col in columns if col not in non_columns]
        category1 = gr.Dropdown(
            categories,
            # No preselection when nothing is left (categories[0] would raise).
            value=categories[0] if categories else None,
            interactive=True,
            label="task metadata",
        )
    else:
        category1 = gr.Dropdown([], value=None, label="task metadata")
    return [k1, pattern, category1]
def update_pattern(domain, partition, k):
    """Build the pattern dropdown from the first ``k`` stored patterns.

    Patterns are read from the first element of ``patterns.pkl`` and offered
    as their ``str()`` form. A disabled empty dropdown is returned when the
    file does not exist.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/patterns.pkl"
    if not os.path.exists(data_path):
        return gr.Dropdown([], value=None, interactive=False, label="pattern")

    # NOTE(review): pickle.load can execute code; assumes this file is trusted
    # application data. The context manager closes the handle promptly.
    with open(data_path, 'rb') as f:
        results = pickle.load(f)
    patterns = [str(p) for p in results[0]]
    # Slider values may arrive as floats, which cannot be used in a slice.
    return gr.Dropdown(patterns[:int(k)], value=None, interactive=True, label="pattern")
def update_threshold(domain, partition, baseline):
    """Build the rank, k and threshold widgets.

    With a ``baseline`` selected all three widgets are disabled. Otherwise
    they are interactive and the k slider is sized to the number of rows in
    merged_data.csv.

    Returns:
        ``[rank_radio, k_slider, threshold_slider]``.
    """
    folder = domain2folder[domain]
    if baseline:
        rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=False)
        k = gr.Slider(1, 10000, 10, step=1.0, label="k", interactive=False)
        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=False)
        return [rank, k, threshold]

    data_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if os.path.exists(data_path):
        # Keep maximum and default inside the slider range: the file may hold
        # fewer than 10 rows, or none at all.
        max_k = max(1, len(pd.read_csv(data_path)))
        k = gr.Slider(1, max_k, min(10, max_k), step=1.0, label="k", interactive=True)
    else:
        k = gr.Slider(1, 1, 1, step=1.0, label="k")
    rank = gr.Radio(['top', 'bottom'], value='top', label="rank", interactive=True)
    threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True)
    return [rank, k, threshold]
def calc_surprisingness(model, scores, embeddings, k):
    """Compare each task's score with the scores of its k most similar tasks.

    Similarity is the dot product between embedding rows. For every task the
    ``k`` highest-similarity tasks are taken, and surprisingness is the score
    difference to each of them weighted by that similarity.

    Args:
        model: Key selecting the score column in ``scores``.
        scores: Mapping/DataFrame where ``scores[model]`` has ``to_numpy()``.
        embeddings: 2-D array with one row per task.
        k: Number of neighbours kept per task.

    Returns:
        Dict with 'similarity', 'task index', 'score difference',
        'all surprisingness' (one row per task, one column per neighbour)
        and 'mean surprisingness' (one value per task).
    """
    model_scores = scores[model].to_numpy()
    full_sim = embeddings @ embeddings.T
    # Sorting the negated matrix yields neighbours in descending similarity.
    neighbor_idx = np.argsort(-full_sim)[:, :k]
    row_idx = np.arange(len(model_scores))[:, None]
    neighbor_sim = full_sim[row_idx, neighbor_idx]
    diff = model_scores[:, None] - model_scores[neighbor_idx]
    surprisingness = diff * neighbor_sim
    return {
        'similarity': neighbor_sim,
        'task index': neighbor_idx,
        'score difference': diff,
        'all surprisingness': surprisingness,
        'mean surprisingness': np.mean(surprisingness, axis=1),
    }
def plot_surprisingness(domain, partition, model, rank, k, num_neighbors):
    """Chart the tasks a model is surprisingly good or bad at.

    Reads the result dict stored in ``<model>_surprise.pkl`` (keys
    'task index', 'score difference', 'all surprisingness', 'similarity'),
    keeps the ``k`` tasks with the highest (rank "top") or lowest mean
    surprisingness over their first ``num_neighbors`` neighbours, and joins
    every neighbour with its row of merged_data.csv.

    Returns:
        A bar chart of mean surprisingness per task next to a scatter plot of
        the neighbours, linked by a point selection, or None when either
        input file is missing.
    """
    folder = domain2folder[domain]
    model_str = model.replace("-", "_")
    sp_pkl = f"{BASE_DIR}/{folder}/{partition}/{model_str}_surprise.pkl"
    merged_path = f"{BASE_DIR}/{folder}/{partition}/merged_data.csv"
    if not (os.path.exists(sp_pkl) and os.path.exists(merged_path)):
        print(sp_pkl, merged_path)
        return None

    # NOTE(review): pickle.load can execute code; assumes this file is trusted
    # application data. The context manager closes the handle promptly.
    with open(sp_pkl, 'rb') as f:
        res = pickle.load(f)

    # Slider values may arrive as floats; range() and iloc need ints.
    k = int(k)
    num_neighbors = int(num_neighbors)

    all_records = []
    for i in range(res['task index'].shape[0]):
        mean_surprisingness = np.mean(res['all surprisingness'][i, :num_neighbors])
        for j in range(num_neighbors):
            all_records.append({
                "task id": i,
                "neighbor rank": j,
                "neighbor id": res['task index'][i, j],
                "score difference": res['score difference'][i, j],
                "surprisingness": res['all surprisingness'][i, j],
                "mean surprisingness": mean_surprisingness,
                "similarity": res['similarity'][i, j],
            })
    sp_df = pd.DataFrame.from_records(all_records)
    # A stable sort keeps the rows of one task together when tasks tie, so
    # the row cut below never splits a task's neighbours.
    sp_df = sp_df.sort_values(by="mean surprisingness", ascending=False, kind="stable")
    num_rows = k * num_neighbors
    selected = sp_df.iloc[:num_rows, :] if rank == "top" else sp_df.iloc[-num_rows:, :]
    # Work on a copy so the column assignments below do not hit a slice.
    df = selected.copy()
    df['is target'] = (df['task id'] == df['neighbor id']).astype(int)

    merged_df = pd.read_csv(merged_path)
    # Look up every neighbour's row once instead of per row and per column.
    neighbor_rows = merged_df.iloc[df['neighbor id'].astype(int).to_numpy()]
    for col in merged_df.columns:
        df[col] = neighbor_rows[col].to_numpy()

    # Only reference the image column when the data actually has one.
    tooltips = ['neighbor id'] + (['image'] if 'image' in merged_df else []) + ['question', 'answer', model]

    pts = alt.selection_point(encodings=['x'])
    embeds = alt.Chart(df).mark_point(size=30, filled=True).encode(
        alt.OpacityValue(0.5),
        alt.X('x:Q', scale=alt.Scale(zero=False)),
        alt.Y('y:Q', scale=alt.Scale(zero=False)),
        alt.Color(f'{model}:Q'),
        alt.Size("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=[300, 500])),
        alt.Shape("is target:N", legend=None, scale=alt.Scale(domain=[0, 1], range=['circle', 'triangle'])),
        alt.Order("is target:N"),
        tooltip=tooltips,
    ).properties(
        width=400,
        height=400,
        title=f"What are the tasks {model} is surprisingly {'good' if rank == 'top' else 'bad'} at compared to {num_neighbors} similar tasks?"
    ).transform_filter(
        pts
    )
    bar = alt.Chart(df).mark_bar().encode(
        alt.Y('mean(mean surprisingness):Q'),
        alt.X('task id:N', sort=alt.EncodingSortField(field='mean surprisingness', order='descending')),
        color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")),
    ).add_params(pts).properties(
        width=400,
        height=200,
    )
    chart = alt.hconcat(
        bar,
        embeds
    ).resolve_legend(
        color="independent",
        size="independent"
    ).configure_title(
        fontSize=20
    ).configure_legend(
        labelFontSize=10,
        titleFontSize=10,
    )
    return chart
def plot_task_distribution(domain, partition, category):
    """Pie chart of how many tasks fall into each value of ``category``.

    Counts are taken from ``task_plan.pkl``; the tooltip shows the value, its
    count and its share of all tasks in percent.

    Returns:
        An Altair arc chart, or None when the task plan file is missing
        (matching the other plot functions in this module).
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return None

    # NOTE(review): pickle.load can execute code; assumes this file is trusted
    # application data. The context manager closes the handle promptly.
    with open(data_path, "rb") as f:
        task_plan = pickle.load(f)
    # reset_index exposes the former index as an 'index' column to count on.
    task_plan = task_plan.reset_index()
    counts = task_plan.groupby(category)['index'].count().reset_index()
    counts = counts.rename(columns={'index': 'count'})
    counts['frequency (%)'] = (counts['count'] / len(task_plan) * 100).round(2)

    base = alt.Chart(counts).encode(
        alt.Theta("count:Q").stack(True),
        alt.Color(f"{category}:N").legend(),
        tooltip=[category, 'count', 'frequency (%)']
    )
    return base.mark_arc(outerRadius=120)
def plot_all(domain, partition, models, category1, category2, agg):
    """Chart aggregated scores of ``models`` over one or two categories.

    With ``category2`` a heat map of ``agg(score)`` over both categories is
    drawn. Otherwise a bar chart over ``category1`` is drawn ("task id" is
    mapped to the "index" field): faceted per category value with one bar per
    model for several models, one bar per category value for a single model.

    Returns:
        An Altair chart, or None when expanded_data.csv is missing.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path):
        return None

    expand_df = pd.read_csv(data_path)
    chart_df = expand_df[expand_df['model'].isin(models)]
    agg_score = f'{agg}(score):Q'

    if category2:
        x_sort = alt.EncodingSortField(field='score', order='ascending', op=agg)
        y_sort = alt.EncodingSortField(field='score', order='descending', op=agg)
        chart = alt.Chart(chart_df).mark_rect().encode(
            alt.X(f'{category1}:N', sort=x_sort),
            alt.Y(f'{category2}:N', sort=y_sort),
            alt.Color(agg_score),
            alt.Tooltip('score', aggregate=agg, title=f"{agg} score"),
        ).properties(width=800, height=200)
    else:
        category = category1 if category1 != "task id" else "index"
        ascending = alt.EncodingSortField(field='score', order='ascending', op=agg)
        bars = alt.Chart(chart_df).mark_bar()
        if len(models) > 1:
            chart = bars.encode(
                alt.X('model:N', sort=ascending,
                      axis=alt.Axis(labels=False, tickSize=0, title=None)),
                alt.Y(agg_score, scale=alt.Scale(zero=True)),
                alt.Color('model:N').legend(),
                alt.Column(f'{category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom')),
            ).properties(width=200, height=100, title=f"How do models perform across {category}?")
        else:
            chart = bars.encode(
                alt.X(f'{category}:N', sort=ascending),
                alt.Y(agg_score, scale=alt.Scale(zero=True)),
                alt.Color(f'{category}:N').legend(None),
            ).properties(width=200, height=100, title=f"How does {models[0]} perform across {category}?")

    styled = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle')
    styled = styled.configure_axis(labelFontSize=20, titleFontSize=20)
    styled = styled.configure_legend(labelFontSize=15, titleFontSize=15)
    return styled
def update_widgets(domain, partition, category, query_type):
    """Build the eleven query widgets for the chosen query type.

    Args:
        domain: Key of ``domain2folder``.
        partition: Sub-folder of the domain folder.
        category: Column whose distinct values bound ``k``; when empty or
            "task id" the row count of expanded_data.csv is used.
        query_type: "top k", "threshold", "model comparison" or
            "model debugging".

    Returns:
        ``[rank, k, direction, threshold, model, model_aggregate, baseline,
        baseline_aggregate, md1, md2, md3]``, or eleven ``None`` values when
        the data file is missing or ``query_type`` is unknown.
    """
    folder = domain2folder[domain]
    data_path = f"{BASE_DIR}/{folder}/{partition}/expanded_data.csv"
    if not os.path.exists(data_path):
        return [None] * 11

    df = pd.read_csv(data_path)
    max_k = len(df[category].unique()) if category and category != "task id" else len(df)
    # Keep maximum and default inside the slider range, which starts at 1.
    max_k = max(1, max_k)
    mid_k = max(1, max_k // 2)
    # Same rule as the partition/model dropdowns: video domains use video models.
    model_choices = VIDEO_MODELS if "video" in folder else MODELS
    aggs = ['mean', 'median', 'min', 'max']

    # Hidden placeholders shared by the branches below.
    def hidden_rank():
        return gr.Radio(['top', 'bottom'], value='top', label=" ", visible=False)

    def hidden_k():
        return gr.Slider(1, max_k, mid_k, step=1.0, label="k", visible=False)

    def hidden_baseline():
        return gr.Dropdown(model_choices, value=None, label="baseline", visible=False)

    def hidden_direction():
        return gr.Radio(['above', 'below'], value='above', label=" ", visible=False)

    def hidden_threshold():
        return gr.Slider(0, 1, 0.0, label="threshold", visible=False)

    def hidden_baseline_aggregate():
        return gr.Radio(aggs, value="mean", label="baseline aggregate", visible=False)

    if query_type == "top k":
        rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
        k = gr.Slider(1, max_k, mid_k, step=1.0, label="k", interactive=True, visible=True)
        model = gr.Dropdown(model_choices, value=model_choices, label="of model(s)'", multiselect=True, interactive=True, visible=True)
        model_aggregate = gr.Dropdown(aggs, value="mean", label=" ", interactive=True, visible=True)
        baseline = hidden_baseline()
        direction = hidden_direction()
        threshold = hidden_threshold()
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown(r"<h2>ranked by the </h2>")
        md2 = gr.Markdown(r"<h2>accuracy</h2>")
        md3 = gr.Markdown(r"")
    elif query_type == "threshold":
        model = gr.Dropdown(model_choices, value=model_choices[0], label="of model(s)'", multiselect=True, interactive=True, visible=True)
        direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
        model_aggregate = gr.Dropdown(aggs, value="mean", label=" ", interactive=True, visible=True)
        rank = hidden_rank()
        k = hidden_k()
        baseline = hidden_baseline()
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown(r"<h2>where the</h2>")
        md2 = gr.Markdown(r"<h2>accuracy is</h2>")
        md3 = gr.Markdown(r"")
    elif query_type == "model comparison":
        model = gr.Dropdown(model_choices, value=model_choices[0], label="of model(s)' accuracy", multiselect=True, interactive=True, visible=True)
        baseline = gr.Dropdown(model_choices, value=None, label="of baseline(s)' accuracy", multiselect=True, interactive=True, visible=True)
        direction = gr.Radio(['above', 'below'], value='above', label=" ", interactive=True, visible=True)
        threshold = gr.Slider(0, 1, 0.0, label="threshold", interactive=True, visible=True)
        model_aggregate = gr.Dropdown(aggs, value="mean", label=" ", interactive=True, visible=True)
        baseline_aggregate = gr.Dropdown(aggs, value="mean", label=" ", interactive=True, visible=True)
        rank = hidden_rank()
        k = hidden_k()
        md1 = gr.Markdown(r"<h2>where the difference between the </h2>")
        md2 = gr.Markdown(r"<h2>is </h2>")
        md3 = gr.Markdown(r"<h2>and the</h2>")
    elif query_type == "model debugging":
        model = gr.Dropdown(model_choices, value=model_choices[0], label="model's", multiselect=False, interactive=True, visible=True)
        baseline = hidden_baseline()
        direction = hidden_direction()
        threshold = hidden_threshold()
        rank = hidden_rank()
        k = hidden_k()
        model_aggregate = gr.Radio(aggs, value="mean", label="task category aggregate (over models)", visible=False)
        baseline_aggregate = hidden_baseline_aggregate()
        md1 = gr.Markdown(r"<h2>where </h2>")
        md2 = gr.Markdown(r"<h2>mean accuracy is below its overall mean accuracy by one standard deviation</h2>")
        md3 = gr.Markdown(r"")
    else:
        # Unknown query type: return placeholders instead of falling through
        # to the list below, where the widget names would be unbound.
        return [None] * 11

    return [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
def _as_model_list(selection):
    """Normalise a dropdown value (None, a single name or a list of names) to a list."""
    if selection is None:
        return []
    if isinstance(selection, str):
        return [selection]
    return list(selection)


def _aggregate_scores(df, category, selected_models, task_agg, group_agg):
    """Reduce raw scores to one score per `category` value.

    Scores are first aggregated per (category, model) with `task_agg`, then
    across the selected models with `group_agg`.
    """
    subset = df[df['model'].isin(_as_model_list(selected_models))]
    per_model = subset.groupby([category, 'model'])['score'].agg(task_agg).reset_index()
    return per_model.groupby([category])['score'].agg(group_agg).reset_index()


def _score_column_name(selected_models, agg):
    """Column label for an aggregated score, e.g. 'mean(a, b)' or just 'a' for one model."""
    names = ', '.join(selected_models)
    return f'{agg}({names})' if len(selected_models) > 1 else names


def select_tasks(domain, partition, category, query_type, task_agg, models, model_agg, rank, k, direction, threshold, baselines, baseline_agg):
    """Answer a query over tasks (or task metadata values) and return the matches.

    Args:
        domain: scenario name; translated to a folder through `domain2folder`.
        partition: sub-folder of the scenario holding the data files.
        category: metadata column to group by; empty or "task id" selects
            individual tasks (the 'index' column).
        query_type: "top k", "threshold", "model comparison" or "model debugging".
        task_agg: aggregation applied to scores within a (category, model) group.
        models: selected model name(s); a single name for "model debugging".
        model_agg: aggregation applied across the selected models.
        rank: "top" or "bottom" (used by "top k").
        k: number of rows to keep (used by "top k").
        direction: "above" or "below" (used by "threshold" and "model comparison").
        threshold: score (or score difference) cut-off.
        baselines: baseline model name(s) (used by "model comparison").
        baseline_agg: aggregation applied across the baselines.

    Returns:
        A `gr.DataFrame` with the selected tasks (plus score columns) when
        querying by task id, otherwise with one row per matching category value.
        An empty `gr.DataFrame` is returned when data files are missing or
        nothing matches.
    """
    domain = domain2folder[domain]
    partition_dir = f"{BASE_DIR}/{domain}/{partition}"
    data_path = f"{partition_dir}/expanded_data.csv"
    merged_path = f"{partition_dir}/merged_data.csv"
    plan_path = f"{partition_dir}/task_plan.pkl"
    if not all(os.path.exists(p) for p in (data_path, merged_path, plan_path)):
        return gr.DataFrame(None)

    df = pd.read_csv(data_path)
    # NOTE: pickle is only safe because this file ships with the app's own data folder.
    with open(plan_path, 'rb') as f:
        task_plan = pickle.load(f)
    task_plan.reset_index(inplace=True)

    if not category or category == "task id":
        category = 'index'

    exp_score_id = baseline_score_id = None
    if query_type == "top k":
        df = _aggregate_scores(df, category, models, task_agg, model_agg)
        df = df.sort_values(by='score', ascending=False)
        k = int(k)  # sliders may deliver floats, which iloc rejects
        df = df.iloc[-k:, :] if rank == "bottom" else df.iloc[:k, :]
    elif query_type == "threshold":
        df = _aggregate_scores(df, category, models, task_agg, model_agg)
        if direction == "below":
            df = df[df['score'] <= threshold]
        else:
            df = df[df['score'] >= threshold]
    elif query_type == "model comparison":
        models = _as_model_list(models)
        baselines = _as_model_list(baselines)
        if not models or not baselines:
            return gr.DataFrame(None, label="Please select at least one model and one baseline.")
        exp_score_id = _score_column_name(models, model_agg)
        baseline_score_id = _score_column_name(baselines, baseline_agg)
        # Both sides are computed from the same raw frame; no copy is needed
        # because the helper never mutates its input.
        df_exp = _aggregate_scores(df, category, models, task_agg, model_agg)
        df_exp = df_exp.sort_values(by=category).rename(columns={'score': exp_score_id})
        df_baseline = _aggregate_scores(df, category, baselines, task_agg, baseline_agg)
        df_baseline = df_baseline.sort_values(by=category).rename(columns={'score': baseline_score_id})
        df = pd.merge(df_exp, df_baseline, on=category)
        difference = df[exp_score_id] - df[baseline_score_id]
        # Honour the above/below choice, as the "threshold" query does.
        df = df[difference < threshold] if direction == "below" else df[difference > threshold]
    elif query_type == "model debugging":
        selected = _as_model_list(models)
        if not selected:
            return gr.DataFrame(None)
        model = selected[0]  # the query is defined for a single model
        merged_df = pd.read_csv(merged_path)
        avg_acc = merged_df[model].mean()
        std = merged_df[model].std()
        df = df[df['model'] == model]
        df = df.groupby(['model', category])['score'].agg(task_agg).reset_index()
        # Keep entries scoring more than one standard deviation below the mean.
        df = df[df['score'] < avg_acc - std].copy()
        df['mean'] = round(avg_acc, 4)
        df['std'] = round(std, 4)

    if category != 'index':
        if len(df) == 0:
            return gr.DataFrame(None, label=f"There is no such {category}.")
        return gr.DataFrame(df, label=f"The total number of such {category} is {len(df)}.")

    # Copy so that the score columns are not written onto a view of task_plan.
    selected_tasks = task_plan[task_plan['index'].isin(df['index'])].copy()
    if len(selected_tasks) == 0:
        return gr.DataFrame(None, label="There is no such task.")
    if query_type == "model comparison":
        score_columns = [exp_score_id, baseline_score_id]
    else:
        score_columns = ['score']
    # One indexed lookup replaces a per-row scan of df; the first match wins.
    lookup = df.drop_duplicates('index').set_index('index')
    for column in score_columns:
        selected_tasks[column] = selected_tasks['index'].map(lookup[column])
    return gr.DataFrame(selected_tasks, label=f"There are {len(selected_tasks)} (out of {len(task_plan)}) tasks in total.")
def find_patterns(selected_tasks, num_patterns, models, baselines, model_agg="mean", baseline_agg="mean"):
    """Mine frequent metadata patterns among the selected tasks.

    Args:
        selected_tasks: DataFrame of tasks, optionally carrying score columns.
        num_patterns: number of patterns to request from `find_frequent_patterns`.
        models: selected model name(s) (None, a single name or a list).
        baselines: selected baseline name(s) (None, a single name or a list).
        model_agg: aggregation label used in the models' score column name.
        baseline_agg: aggregation label used in the baselines' score column name.

    Returns:
        A `gr.DataFrame` with one row per pattern ('pattern', 'count' and, when
        a 'score' column was present in the input, 'score').
    """
    if selected_tasks is None or len(selected_tasks) == 0:
        return gr.DataFrame(None)

    scores = selected_tasks['score'] if 'score' in selected_tasks else None

    # Score columns must not take part in pattern mining; collect their names.
    score_columns = {'score'}
    for names, agg in ((models, model_agg), (baselines, baseline_agg)):
        if not names:
            continue
        if isinstance(names, str):  # single-select dropdowns deliver a bare string
            names = [names]
        joined = ', '.join(names)
        score_columns.add(f'{agg}({joined})' if len(names) > 1 else joined)
    tasks_only = selected_tasks.drop(
        columns=[column for column in selected_tasks.columns if column in score_columns]
    )

    results = find_frequent_patterns(k=int(num_patterns), df=tasks_only, scores=scores)
    if scores is not None:
        patterns, pattern_scores = results[0], results[1]
    else:
        patterns, pattern_scores = results, None

    records = []
    for position, (count, items) in enumerate(patterns):
        record = {
            'pattern': ', '.join(f"{col_name} = {col_val}" for col_name, col_val in items),
            'count': count,
        }
        if pattern_scores is not None:
            record['score'] = pattern_scores[position]
        records.append(record)
    return gr.DataFrame(pd.DataFrame.from_records(records))
def visualize_task_distribution(selected_tasks, col_name, model1, model2):
    """Draw a pie chart of how the selected tasks are distributed over `col_name`.

    Args:
        selected_tasks: DataFrame of the selected tasks.
        col_name: metadata column to count by.
        model1: unused; kept so the signature matches the UI wiring.
        model2: unused; kept so the signature matches the UI wiring.

    Returns:
        An Altair arc chart, or None when there is nothing to plot (no column
        chosen, no tasks, or the column is not in the frame).
    """
    if not col_name or selected_tasks is None or len(selected_tasks) == 0:
        return None
    if col_name not in selected_tasks:
        return None

    # size() counts rows per group without relying on an 'index' column, so it
    # also works when grouping by 'index' itself.
    task_plan_cnt = selected_tasks.groupby(col_name).size().reset_index(name='count')
    task_plan_cnt['frequency (%)'] = (task_plan_cnt['count'] / len(selected_tasks) * 100).round(2)

    tooltips = [col_name, 'count', 'frequency (%)']
    base = alt.Chart(task_plan_cnt).encode(
        alt.Theta("count:Q").stack(True),
        alt.Color(f"{col_name}:N").legend(),
        tooltip=tooltips,
    )
    return base.mark_arc(outerRadius=120)
def plot_performance_for_selected_tasks(domain, partition, df, query_type, models, baselines, select_category, vis_category, task_agg, model_agg, baseline_agg, rank, direction, threshold):
    """Plot model accuracy for the rows returned by a query.

    Args:
        domain: scenario name; translated to a folder through `domain2folder`.
        partition: sub-folder of the scenario holding the data files.
        df: DataFrame produced by the query (tasks or category values).
        query_type: "top k", "threshold", "model comparison" or "model debugging".
        models: selected model name(s); a single name for "model debugging".
        baselines: baseline model name(s) (used by "model comparison").
        select_category: what the query selected ("task id" or a metadata column).
        vis_category: metadata column to plot by.
        task_agg: ignored; the plot always aggregates with "mean".
        model_agg: aggregation label for the selected models.
        baseline_agg: aggregation label for the baselines.
        rank: "top" or "bottom" (only used in the title).
        direction: "above" or "below" (only used in the title).
        threshold: cut-off (only used in the title).

    Returns:
        A configured Altair bar chart, or None when the data files are missing,
        the query result is empty, or no column to plot by was chosen.
    """
    domain = domain2folder[domain]
    task_agg = "mean"  # the plot always shows mean accuracy, whatever was passed in
    data_path = f"{BASE_DIR}/{domain}/{partition}/expanded_data.csv"
    merged_data_path = f"{BASE_DIR}/{domain}/{partition}/merged_data.csv"
    if not os.path.exists(data_path) or not os.path.exists(merged_data_path):
        return None
    if df is None or len(df) == 0 or not vis_category:
        return None

    # True when individual tasks (not category values) were selected.
    by_task = select_category == "task id"
    y_val = f'{task_agg}(score):Q' if by_task else 'score:Q'
    if by_task:
        select_category = "index"

    if query_type == "model comparison":
        model_names = [models] if isinstance(models, str) else list(models or [])
        baseline_names = [baselines] if isinstance(baselines, str) else list(baselines or [])
        if not model_names or not baseline_names:
            return None
        model_str = ', '.join(model_names)
        exp_score_id = f'{model_agg}({model_str})' if len(model_names) > 1 else model_str
        baseline_str = ', '.join(baseline_names)
        baseline_score_id = f'{baseline_agg}({baseline_str})' if len(baseline_names) > 1 else baseline_str

        # Re-format to long form: one row per (category value, score column).
        df = df.melt(id_vars=[select_category], value_vars=[exp_score_id, baseline_score_id])
        df = df.rename(columns={'variable': 'model', 'value': 'score'})
        if by_task:
            merged_df = pd.read_csv(merged_data_path)
            # Task ids are looked up against merged_df's row labels.
            df[vis_category] = df['index'].map(merged_df[vis_category])

        num_columns = len(df['model'].unique()) * len(df[f'{vis_category}'].unique())
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('model:N',
                  sort=alt.EncodingSortField(field='score', order='descending', op=task_agg),
                  axis=alt.Axis(labels=False, tickSize=0, title=None)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title="accuracy"),
            alt.Color('model:N').legend(),
            alt.Column(f'{vis_category}:N', header=alt.Header(titleOrient='bottom', labelOrient='bottom', labelFontSize=20, titleFontSize=20,))
        ).properties(
            width=num_columns * 30,
            height=200,
            title=f"How do models perform by {vis_category}?"
        )
    else:
        if query_type == "model debugging":
            y_title = "accuracy"
            model_name = models if isinstance(models, str) else ', '.join(models or [])
            plot_title = f"{model_name} performs worse than its (mean - std) on these {vis_category}s"
            models = [models] if isinstance(models, str) else list(models or [])
        else:
            models = [models] if isinstance(models, str) else list(models or [])
            model_str = ', '.join(models)
            y_title = f"{model_agg} accuracy" if len(models) > 0 else "accuracy"
            suffix = f"on these tasks (by {vis_category})" if select_category == "index" else f"on these {vis_category}s"
            if query_type == "top k":
                plot_title = f"The {model_agg} accuracy of {model_str} is the {'highest' if rank == 'top' else 'lowest'} " + suffix
            elif query_type == "threshold":
                plot_title = f"The {model_agg} accuracy of {model_str} is {direction} {threshold} " + suffix
            else:
                # Fallback so an unrecognised query type still gets a title.
                plot_title = f"The {model_agg} accuracy of {model_str} " + suffix

        if by_task:
            # Plot the raw per-task scores of the selected tasks and models.
            expand_df = pd.read_csv(data_path)
            task_ids = list(df['index'].unique())
            df = expand_df[(expand_df['model'].isin(models)) & (expand_df['task id'].isin(task_ids))]

        num_columns = len(df[f'{vis_category}'].unique())
        chart = alt.Chart(df).mark_bar().encode(
            alt.X(f'{vis_category}:N', sort=alt.EncodingSortField(field='score', order='ascending', op=task_agg), axis=alt.Axis(labelAngle=-45)),
            alt.Y(y_val, scale=alt.Scale(zero=True), title=y_title),
            alt.Color(f'{vis_category}:N').legend(None),
        ).properties(
            width=num_columns * 30,
            height=200,
            title=plot_title
        )

    chart = chart.configure_title(fontSize=20, offset=5, orient='top', anchor='middle').configure_axis(
        labelFontSize=20,
        titleFontSize=20,
    ).configure_legend(
        labelFontSize=20,
        titleFontSize=20,
        labelLimit=200,
    )
    return chart
def sync_vis_category(domain, partition, category):
    """Keep the two "by task metadata" dropdowns in step with the queried category.

    When a metadata category is being queried, both dropdowns are locked to it.
    When querying by task id, both offer every column of the partition's task plan.

    Returns:
        A list of two `gr.Dropdown` updates, or `[None, None]` when the task
        plan is missing or has no columns.
    """
    domain = domain2folder[domain]

    def _dropdown(choices, interactive):
        # A fresh component per output, as the original did.
        return gr.Dropdown(choices, value=choices[0], label="by task metadata", interactive=interactive)

    if category and category != "task id":
        return [_dropdown([category], False), _dropdown([category], False)]

    data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
    if not os.path.exists(data_path):
        return [None, None]
    # Close the file deterministically instead of leaking the handle.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)
    categories = list(data.columns)
    if not categories:
        return [None, None]
    return [_dropdown(categories, True), _dropdown(categories, True)]
def hide_fpm_and_dist_components(domain, partition, category):
    """Show or hide the pattern-mining and task-distribution widgets.

    The widgets are hidden when a metadata category is being queried and shown
    when querying by task id.

    Returns:
        `[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]`
        component updates, in the order expected by the UI wiring.
    """
    domain = domain2folder[domain]
    show = not (category and category != "task id")

    if show:
        categories = []
        data_path = f"{BASE_DIR}/{domain}/{partition}/task_plan.pkl"
        if os.path.exists(data_path):
            # Close the file deterministically instead of leaking the handle.
            with open(data_path, 'rb') as f:
                data = pickle.load(f)
            categories = list(data.columns)
        col_name = gr.Dropdown(
            categories,
            value=categories[0] if categories else None,
            label="by task metadata",
            interactive=True,
            visible=True,
        )
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", interactive=True, visible=True)
        btn_pattern = gr.Button(value="Find patterns among tasks", interactive=True, visible=True)
        table = gr.DataFrame({}, height=250, interactive=True, visible=True)
        dist_chart = Plot(visible=True)
        btn_dist = gr.Button(value="Visualize task distribution", interactive=True, visible=True)
    else:
        num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns", visible=False)
        btn_pattern = gr.Button(value="Find patterns among tasks", visible=False)
        table = gr.DataFrame({}, height=250, visible=False)
        dist_chart = Plot(visible=False)
        col_name = gr.Dropdown([], value=None, label="by task metadata", visible=False)
        btn_dist = gr.Button(value="Visualize task distribution", visible=False)

    return [num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart]
# domains = list_directories(BASE_DIR)
# Load the 'sudeepshouche/minimalist' theme via gr.Theme.from_hub, then override
# its font stack and text size before building the app.
theme = gr.Theme.from_hub('sudeepshouche/minimalist')
theme.font = [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"] # gr.themes.GoogleFont("Source Sans Pro") # [gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"]
theme.text_size = gr.themes.sizes.text_lg
# theme = theme.set(font=)
# Top-level Blocks container; the layout is populated in the `with demo:` section.
demo = gr.Blocks(theme=theme, title="TaskMeAnything-UI") #
# NOTE(review): the container nesting below was reconstructed from a copy of the
# file that had lost its indentation; confirm the layout against the running app.
with demo:
    # Page header: the title sits between two empty columns.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r"<h1>Welcome to TaskMeAnything-UI! </h1>"
            )
        with gr.Column(scale=1):
            gr.Markdown(
                r""
            )

    with gr.Tab("📊 Overview"):
        gr.Markdown(
            r"<h2>📊 Visualize the overall task distribution and model performance </h2>"
        )
        with gr.Row():
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)

        gr.Markdown(
            r"<h2>Overall task metadata distribution</h2>"
        )
        with gr.Row():
            category = gr.Dropdown([], value=None, label="task metadata")
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
        with gr.Row():
            output = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_task_distribution, [domain, partition, category], output)

        gr.Markdown(
            r"<h2>Models' overall performance by task metadata</h2>"
        )
        with gr.Row():
            with gr.Column(scale=2):
                models = gr.CheckboxGroup(MODELS, label="model(s)", value=MODELS)
            with gr.Column(scale=1):
                aggregate = gr.Radio(['mean', 'median', 'min', 'max'], value="mean", label="aggregate models' accuracy by")
        with gr.Row():
            category1 = gr.Dropdown([], value=None, label="task metadata", interactive=True)
            category2 = gr.Dropdown([], value=None, label="Optional: second task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=category1)
            category1.change(fn=update_category2, inputs=[domain, partition, category1], outputs=category2)
            domain.change(fn=update_partition_and_models, inputs=domain, outputs=[partition, models])
        with gr.Row():
            output = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_all, [domain, partition, models, category1, category2, aggregate], output)

    with gr.Tab("✨ Embedding"):
        gr.Markdown(
            r"<h2>✨ Visualize the tasks' embeddings in the 2D space </h2>"
        )
        with gr.Row():
            domain2 = gr.Radio(domains, label="scenario", scale=2)
            partition2 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
            category2 = gr.Dropdown([], value=None, label="colored by task metadata", scale=1)
            domain2.change(fn=update_partition, inputs=domain2, outputs=partition2)
            partition2.change(fn=update_category, inputs=[domain2, partition2], outputs=category2)
        with gr.Row():
            output2 = Plot()
        with gr.Row():
            btn = gr.Button(value="Run")
            btn.click(plot_embedding, [domain2, partition2, category2], output2)

    with gr.Tab("❓ Query"):
        gr.Markdown(
            r"<h2>❓ Find out the answers to your queries by finding and visualizing the relevant tasks and models' performance </h2>"
        )
        with gr.Row(equal_height=True):
            domain = gr.Radio(domains, label="scenario", scale=2)
            partition = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
        with gr.Row():
            query1 = "top k"
            query2 = "threshold"
            query3 = "model debugging"
            query4 = "model comparison"
            query_type = gr.Radio([query1, query2, query3, query4], value="top k", label=r"query type")
        with gr.Row():
            with gr.Accordion("See more details about the query type"):
                gr.Markdown(
                    r"<ul><li>Top k: Find the k tasks or task metadata that the model(s) perform the best or worst on</li><li>Threshold: Find the tasks or task metadata where the model(s)' performance is greater or lower than a given threshold t</li><li>Model debugging: Find the tasks or task metadata where a model performs significantly worse than its average performance (by one standard deviation)</li><li>Model comparison: Find the tasks or task metadata where some model(s) perform better or worse than the baseline(s) by a given threshold t</li></ul>"
                )

        # The rows below spell out the query as a sentence; update_widgets
        # rewrites the markdown pieces and toggles widgets per query type.
        with gr.Row():
            gr.Markdown(r"<h2>Help me find the</h2>")
        with gr.Row(equal_height=True):
            rank = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            k = gr.Slider(1, 10, 5 // 2, step=1.0, label="k", interactive=True, visible=True)
            category = gr.Dropdown([], value=None, label="tasks / task metadata", interactive=True)
        with gr.Row():
            md1 = gr.Markdown(r"<h2>ranked by the </h2>")
        with gr.Row(equal_height=True):
            model_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=True, scale=1)
            model = gr.Dropdown(MODELS, value=MODELS, label="of model(s)", multiselect=True, interactive=True, visible=True, scale=2)
        with gr.Row():
            md3 = gr.Markdown(r"")
        with gr.Row(equal_height=True):
            baseline_aggregate = gr.Dropdown(['mean', 'median', 'min', 'max'], value="mean", label=" ", interactive=True, visible=False, scale=1)
            baseline = gr.Dropdown(MODELS, value=None, label="of baseline(s)'", visible=False, scale=2)
        with gr.Row():
            md2 = gr.Markdown(r"<h2>accuracy</h2>")
        with gr.Row():
            direction = gr.Radio(['above', 'below'], value='above', label=" ", visible=False)
            threshold = gr.Slider(0, 1, 0.0, label="threshold", visible=False)

        widgets = [rank, k, direction, threshold, model, model_aggregate, baseline, baseline_aggregate, md1, md2, md3]
        partition.change(fn=update_category, inputs=[domain, partition], outputs=category)
        query_type.change(update_widgets, [domain, partition, category, query_type], widgets)
        domain.change(fn=update_partition_and_models_and_baselines, inputs=domain, outputs=[partition, model, baseline])

        with gr.Row():
            df = gr.DataFrame({}, height=200)
        btn = gr.Button(value="Find tasks / task metadata")
        # `aggregate` is the radio defined on the Overview tab; this tab has no
        # task-aggregate widget of its own.
        btn.click(select_tasks, [domain, partition, category, query_type, aggregate, model, model_aggregate, rank, k, direction, threshold, baseline, baseline_aggregate], df)

        with gr.Row():
            plot = Plot()
        with gr.Row():
            col_name2 = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name2)
        btn_plot = gr.Button(value="Plot model performance", interactive=True)
        btn_plot.click(plot_performance_for_selected_tasks, [domain, partition, df, query_type, model, baseline, category, col_name2, aggregate, model_aggregate, baseline_aggregate, rank, direction, threshold], plot)

        with gr.Row():
            dist_chart = Plot()
        with gr.Row():
            col_name = gr.Dropdown([], value=None, label="by task metadata", interactive=True)
            partition.change(fn=update_category, inputs=[domain, partition], outputs=col_name)
        btn_dist = gr.Button(value="Visualize task distribution", interactive=True)
        btn_dist.click(visualize_task_distribution, [df, col_name, model, baseline], dist_chart)

        with gr.Row():
            table = gr.DataFrame({}, height=250)
        with gr.Row():
            num_patterns = gr.Slider(1, 100, 50, step=1.0, label="number of patterns")
        btn_pattern = gr.Button(value="Find patterns among tasks")
        # find_patterns takes the two aggregate labels as well; they are needed
        # to recognise (and drop) the score columns added by a model comparison.
        btn_pattern.click(find_patterns, [df, num_patterns, model, baseline, model_aggregate, baseline_aggregate], table)

        category.change(fn=hide_fpm_and_dist_components, inputs=[domain, partition, category], outputs=[num_patterns, btn_pattern, table, col_name, btn_dist, dist_chart])
        category.change(fn=sync_vis_category, inputs=[domain, partition, category], outputs=[col_name, col_name2])
        category.change(fn=update_k, inputs=[domain, partition, category], outputs=k)

    with gr.Tab("😮 Surprisingness"):
        gr.Markdown(r"<h2>😮 Find out the tasks a model is surprisingly good or bad at compared to similar tasks</h2>")
        with gr.Row():
            domain3 = gr.Radio(domains, label="scenario", scale=2)
            partition3 = gr.Dropdown([], value=None, label="task space of the following task generator", scale=1)
        with gr.Row():
            model3 = gr.Dropdown(MODELS, value=MODELS[0], label="model", interactive=True, visible=True)
            k3 = gr.Slider(1, 100, 50, step=1.0, label="number of surprising tasks", interactive=True)
            num_neighbors = gr.Slider(1, 100, 50, step=1.0, label="number of neighbors", interactive=True)
            rank3 = gr.Radio(['top', 'bottom'], value='top', label=" ", interactive=True, visible=True)
            domain3.change(fn=update_partition_and_models, inputs=domain3, outputs=[partition3, model3])
        with gr.Row():
            output3 = Plot()
        with gr.Row():
            btn = gr.Button(value="Plot")
            btn.click(plot_surprisingness, [domain3, partition3, model3, rank3, k3, num_neighbors], output3)

# if __name__ == "__main__":
demo.launch(share=True)