Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -45,14 +45,15 @@ st.set_page_config(
|
|
| 45 |
}
|
| 46 |
)
|
| 47 |
|
| 48 |
-
|
| 49 |
-
'task'
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
| 51 |
tokenizer_name_or_path = f'malteos/aspect-scibert-{aspects[0]}' # any aspect
|
| 52 |
dataset_config = 'malteos/aspect-paper-metadata'
|
| 53 |
|
| 54 |
-
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
|
| 55 |
-
|
| 56 |
|
| 57 |
@st.cache(show_spinner=False)
|
| 58 |
def st_load_model(name_or_path):
|
|
@@ -63,7 +64,7 @@ def st_load_model(name_or_path):
|
|
| 63 |
|
| 64 |
@st.cache(show_spinner=False)
|
| 65 |
def st_load_dataset(name_or_path):
|
| 66 |
-
with st.spinner('Loading the dataset (this might take a while)...'):
|
| 67 |
dataset = load_dataset(name_or_path)
|
| 68 |
|
| 69 |
if isinstance(dataset, DatasetDict):
|
|
@@ -84,6 +85,7 @@ aspect_to_model = dict(
|
|
| 84 |
dataset = st_load_dataset(dataset_config)
|
| 85 |
|
| 86 |
|
|
|
|
| 87 |
def get_paper(doc_id):
|
| 88 |
res = requests.get(f'https://api.semanticscholar.org/v1/paper/{doc_id}')
|
| 89 |
|
|
@@ -93,32 +95,35 @@ def get_paper(doc_id):
|
|
| 93 |
raise ValueError(f'Cannot load paper from S2 API: {doc_id}')
|
| 94 |
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def find_related_papers(paper_id, user_aspect):
|
| 97 |
with st.spinner('Searching for related papers...'):
|
|
|
|
| 98 |
|
| 99 |
paper = get_paper(paper_id)
|
| 100 |
|
| 101 |
if paper is None or 'title' not in paper or paper['title'] is None or 'abstract' not in paper or paper['abstract'] is None:
|
| 102 |
-
raise ValueError(f'Could not retrieve title and abstract for input paper: {paper_id}')
|
| 103 |
|
| 104 |
title_abs = paper['title'] + ': ' + paper['abstract']
|
| 105 |
|
| 106 |
-
# preprocess the input
|
| 107 |
-
inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
| 108 |
-
|
| 109 |
-
# inference
|
| 110 |
-
outputs = aspect_to_model[user_aspect](**inputs)
|
| 111 |
-
|
| 112 |
-
# logger.info(f'attention_mask: {inputs["attention_mask"].shape}')
|
| 113 |
-
#
|
| 114 |
-
# logger.info(f'Outputs: {outputs["last_hidden_state"]}')
|
| 115 |
-
# logger.info(f'Outputs: {outputs["last_hidden_state"].shape}')
|
| 116 |
-
|
| 117 |
-
# Mean pool the token-level embeddings to get sentence-level embeddings
|
| 118 |
-
embeddings = torch.sum(
|
| 119 |
-
outputs["last_hidden_state"] * inputs['attention_mask'].unsqueeze(-1), dim=1
|
| 120 |
-
) / torch.clamp(torch.sum(inputs['attention_mask'], dim=1, keepdims=True), min=1e-9)
|
| 121 |
-
|
| 122 |
result = dict(
|
| 123 |
paper=paper,
|
| 124 |
aspect=user_aspect,
|
|
@@ -129,7 +134,7 @@ def find_related_papers(paper_id, user_aspect):
|
|
| 129 |
))
|
| 130 |
|
| 131 |
# Retrieval
|
| 132 |
-
prompt =
|
| 133 |
scores, retrieved_examples = dataset.get_nearest_examples(f'{user_aspect}_embeddings', prompt, k=10)
|
| 134 |
|
| 135 |
result.update(dict(
|
|
@@ -144,9 +149,9 @@ st.title('Aspect-based Paper Similarity')
|
|
| 144 |
st.markdown("""This demo showcases [Specialized Document Embeddings for Aspect-based Research Paper Similarity](#TODO).""")
|
| 145 |
|
| 146 |
# Introduction
|
| 147 |
-
st.markdown(f"""The model was trained using a triplet loss on machine learning papers from the [paperswithcode.com](https://paperswithcode.com/) corpus with the objective of pulling embeddings of papers with the same task, method, or
|
| 148 |
For a more comprehensive overview of the model check out the [model card on 🤗 Model Hub]({model_hub_url}) or read [our paper](#TODO).""")
|
| 149 |
-
st.markdown("""Enter a ArXiv ID or a DOI of a paper for that you want find similar papers.
|
| 150 |
|
| 151 |
Try it yourself! π""",
|
| 152 |
unsafe_allow_html=True)
|
|
@@ -165,6 +170,8 @@ with st.form("aspect-input", clear_on_submit=False):
|
|
| 165 |
"ACL:N19-1423": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
|
| 166 |
"10.18653/v1/S16-1001": "SemEval-2016 Task 4: Sentiment Analysis in Twitter",
|
| 167 |
"10.1145/3065386": "ImageNet classification with deep convolutional neural networks",
|
|
|
|
|
|
|
| 168 |
}
|
| 169 |
|
| 170 |
example = st.selectbox(
|
|
@@ -175,7 +182,8 @@ with st.form("aspect-input", clear_on_submit=False):
|
|
| 175 |
|
| 176 |
user_aspect = st.radio(
|
| 177 |
label="In what aspect are you interested?",
|
| 178 |
-
options=aspects
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
cols = st.columns(3)
|
|
|
|
| 45 |
}
|
| 46 |
)
|
| 47 |
|
| 48 |
+
aspect_labels = {
|
| 49 |
+
'task': 'Task 🎯 ',
|
| 50 |
+
'method': 'Method 🔨 ',
|
| 51 |
+
'dataset': 'Dataset 🏷️',
|
| 52 |
+
}
|
| 53 |
+
aspects = list(aspect_labels.keys())
|
| 54 |
tokenizer_name_or_path = f'malteos/aspect-scibert-{aspects[0]}' # any aspect
|
| 55 |
dataset_config = 'malteos/aspect-paper-metadata'
|
| 56 |
|
|
|
|
|
|
|
| 57 |
|
| 58 |
@st.cache(show_spinner=False)
|
| 59 |
def st_load_model(name_or_path):
|
|
|
|
| 64 |
|
| 65 |
@st.cache(show_spinner=False)
|
| 66 |
def st_load_dataset(name_or_path):
|
| 67 |
+
with st.spinner('Loading the dataset and search index (this might take a while)...'):
|
| 68 |
dataset = load_dataset(name_or_path)
|
| 69 |
|
| 70 |
if isinstance(dataset, DatasetDict):
|
|
|
|
| 85 |
dataset = st_load_dataset(dataset_config)
|
| 86 |
|
| 87 |
|
| 88 |
+
@st.cache(show_spinner=False)
|
| 89 |
def get_paper(doc_id):
|
| 90 |
res = requests.get(f'https://api.semanticscholar.org/v1/paper/{doc_id}')
|
| 91 |
|
|
|
|
| 95 |
raise ValueError(f'Cannot load paper from S2 API: {doc_id}')
|
| 96 |
|
| 97 |
|
| 98 |
+
def get_embedding(input_text, user_aspect):
|
| 99 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
|
| 100 |
+
|
| 101 |
+
# preprocess the input
|
| 102 |
+
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
| 103 |
+
|
| 104 |
+
# inference
|
| 105 |
+
outputs = aspect_to_model[user_aspect](**inputs)
|
| 106 |
+
|
| 107 |
+
# Mean pool the token-level embeddings to get sentence-level embeddings
|
| 108 |
+
embeddings = torch.sum(
|
| 109 |
+
outputs["last_hidden_state"] * inputs['attention_mask'].unsqueeze(-1), dim=1
|
| 110 |
+
) / torch.clamp(torch.sum(inputs['attention_mask'], dim=1, keepdims=True), min=1e-9)
|
| 111 |
+
|
| 112 |
+
return embeddings.detach().numpy()[0]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@st.cache(show_spinner=False)
|
| 116 |
def find_related_papers(paper_id, user_aspect):
|
| 117 |
with st.spinner('Searching for related papers...'):
|
| 118 |
+
paper_id = paper_id.strip() # remove white spaces
|
| 119 |
|
| 120 |
paper = get_paper(paper_id)
|
| 121 |
|
| 122 |
if paper is None or 'title' not in paper or paper['title'] is None or 'abstract' not in paper or paper['abstract'] is None:
|
| 123 |
+
raise ValueError(f'Could not retrieve title and abstract for input paper (the paper is probably behind a paywall): {paper_id}')
|
| 124 |
|
| 125 |
title_abs = paper['title'] + ': ' + paper['abstract']
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
result = dict(
|
| 128 |
paper=paper,
|
| 129 |
aspect=user_aspect,
|
|
|
|
| 134 |
))
|
| 135 |
|
| 136 |
# Retrieval
|
| 137 |
+
prompt = get_embedding(title_abs, user_aspect)
|
| 138 |
scores, retrieved_examples = dataset.get_nearest_examples(f'{user_aspect}_embeddings', prompt, k=10)
|
| 139 |
|
| 140 |
result.update(dict(
|
|
|
|
| 149 |
st.markdown("""This demo showcases [Specialized Document Embeddings for Aspect-based Research Paper Similarity](#TODO).""")
|
| 150 |
|
| 151 |
# Introduction
|
| 152 |
+
st.markdown(f"""The model was trained using a triplet loss on machine learning papers from the [paperswithcode.com](https://paperswithcode.com/) corpus with the objective of pulling embeddings of papers with the same task, method, or dataset close together.
|
| 153 |
For a more comprehensive overview of the model check out the [model card on 🤗 Model Hub]({model_hub_url}) or read [our paper](#TODO).""")
|
| 154 |
+
st.markdown("""Enter a ArXiv ID or a DOI of a paper for that you want find similar papers. The title and abstract of the input paper must be available through the [Semantic Scholar API](https://www.semanticscholar.org/product/api).
|
| 155 |
|
| 156 |
Try it yourself! π""",
|
| 157 |
unsafe_allow_html=True)
|
|
|
|
| 170 |
"ACL:N19-1423": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
|
| 171 |
"10.18653/v1/S16-1001": "SemEval-2016 Task 4: Sentiment Analysis in Twitter",
|
| 172 |
"10.1145/3065386": "ImageNet classification with deep convolutional neural networks",
|
| 173 |
+
"arXiv:2101.08700": "Multi-sense embeddings through a word sense disambiguation process",
|
| 174 |
+
"10.1145/3340531.3411878": "Incremental and parallel computation of structural graph summaries for evolving graphs",
|
| 175 |
}
|
| 176 |
|
| 177 |
example = st.selectbox(
|
|
|
|
| 182 |
|
| 183 |
user_aspect = st.radio(
|
| 184 |
label="In what aspect are you interested?",
|
| 185 |
+
options=aspects,
|
| 186 |
+
format_func=lambda option_key: aspect_labels[option_key],
|
| 187 |
)
|
| 188 |
|
| 189 |
cols = st.columns(3)
|