Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -68,37 +68,140 @@ GLASS_HINTS = {
|
|
| 68 |
}
|
| 69 |
|
| 70 |
# ========================
|
| 71 |
-
# Robust extraction helpers
|
| 72 |
# ========================
|
| 73 |
def _clean(s):
|
| 74 |
return s.strip() if isinstance(s, str) else ""
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def _split_ingredient_blob(s):
|
|
|
|
| 77 |
if not isinstance(s, str):
|
| 78 |
return []
|
| 79 |
parts = re.split(r"[,\n;β’\-β]+", s)
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
def _from_list_of_pairs(val):
|
| 83 |
-
|
|
|
|
| 84 |
for x in val:
|
| 85 |
-
if isinstance(x, (list, tuple)):
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
def _from_list_of_dicts(val):
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
for x in val:
|
| 94 |
-
if isinstance(x, dict):
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
break
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
def _get_title(row, cols):
|
| 104 |
for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
|
|
@@ -106,43 +209,32 @@ def _get_title(row, cols):
|
|
| 106 |
return _clean(row[k])
|
| 107 |
return "Untitled"
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
if "ingredient_tokens" in cols and row.get("ingredient_tokens"):
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
if k in cols and row.get(k) not in (None, "", [], {}):
|
| 133 |
-
val = row[k]
|
| 134 |
-
if isinstance(val, str):
|
| 135 |
-
vals = _split_ingredient_blob(val)
|
| 136 |
-
if vals: return vals
|
| 137 |
-
elif isinstance(val, list):
|
| 138 |
-
vals = [str(x).strip().lower() for x in val if isinstance(x, str) and x.strip()]
|
| 139 |
-
if vals: return vals
|
| 140 |
-
vals = _from_list_of_pairs(val)
|
| 141 |
-
if vals: return vals
|
| 142 |
-
vals = _from_list_of_dicts(val)
|
| 143 |
-
if vals: return vals
|
| 144 |
-
|
| 145 |
-
return []
|
| 146 |
|
| 147 |
def _get_glass(row, cols, title, ingredients_text):
|
| 148 |
# direct fields first
|
|
@@ -172,7 +264,7 @@ def tag_flavors(text):
|
|
| 172 |
return tags
|
| 173 |
|
| 174 |
# ========================
|
| 175 |
-
# Load dataset
|
| 176 |
# ========================
|
| 177 |
ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
|
| 178 |
cols = ds.column_names
|
|
@@ -180,21 +272,27 @@ cols = ds.column_names
|
|
| 180 |
DOCS = []
|
| 181 |
for r in ds:
|
| 182 |
title = _get_title(r, cols)
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
DOCS.append({
|
| 188 |
"title": title,
|
| 189 |
"glass": glass,
|
| 190 |
-
"
|
|
|
|
| 191 |
"text": fused,
|
| 192 |
"base": tag_base(fused),
|
| 193 |
"flavors": tag_flavors(fused),
|
| 194 |
})
|
| 195 |
|
| 196 |
# ========================
|
| 197 |
-
# Embeddings (title + glass +
|
| 198 |
# ========================
|
| 199 |
encoder = SentenceTransformer(EMBED_MODEL)
|
| 200 |
doc_embs = encoder.encode(
|
|
@@ -206,6 +304,12 @@ doc_embs = encoder.encode(
|
|
| 206 |
# ========================
|
| 207 |
# Recommendation (base filter + flavor boost)
|
| 208 |
# ========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
def recommend(base_alcohol, flavor, top_k=3):
|
| 210 |
if base_alcohol not in BASE_OPTIONS:
|
| 211 |
return "Please choose a base alcohol."
|
|
@@ -239,13 +343,14 @@ def recommend(base_alcohol, flavor, top_k=3):
|
|
| 239 |
blocks = []
|
| 240 |
for sc, i in picks:
|
| 241 |
d = DOCS[i]
|
| 242 |
-
|
|
|
|
| 243 |
meta = f"**Base:** {d['base']} | **Flavor tags:** {', '.join(d['flavors']) or 'β'} | **Score:** {sc:.3f}"
|
| 244 |
blocks.append(
|
| 245 |
f"### {d['title']}\n"
|
| 246 |
f"**Glass:** {d['glass']} \n"
|
| 247 |
f"{meta}\n\n"
|
| 248 |
-
f"**Ingredients
|
| 249 |
)
|
| 250 |
return "\n\n---\n\n".join(blocks)
|
| 251 |
|
|
@@ -253,7 +358,7 @@ def recommend(base_alcohol, flavor, top_k=3):
|
|
| 253 |
# UI
|
| 254 |
# ========================
|
| 255 |
with gr.Blocks() as demo:
|
| 256 |
-
gr.Markdown("# πΉ
|
| 257 |
|
| 258 |
with gr.Row():
|
| 259 |
base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")
|
|
|
|
| 68 |
}
|
| 69 |
|
| 70 |
# ========================
|
| 71 |
+
# Robust extraction helpers (with measures)
|
| 72 |
# ========================
|
| 73 |
def _clean(s):
|
| 74 |
return s.strip() if isinstance(s, str) else ""
|
| 75 |
|
| 76 |
+
def _norm_measure(s: str) -> str:
|
| 77 |
+
"""Normalize common unit spacing/casing a bit."""
|
| 78 |
+
if not isinstance(s, str):
|
| 79 |
+
return ""
|
| 80 |
+
s = s.strip()
|
| 81 |
+
s = re.sub(r"\s+", " ", s)
|
| 82 |
+
# normalize ml/oz tsp tbsp dashes, etc.
|
| 83 |
+
s = re.sub(r"\bml\b", "ml", s, flags=re.I)
|
| 84 |
+
s = re.sub(r"\boz\b", "oz", s, flags=re.I)
|
| 85 |
+
s = re.sub(r"\btsp\b", "tsp", s, flags=re.I)
|
| 86 |
+
s = re.sub(r"\btbsp\b", "tbsp", s, flags=re.I)
|
| 87 |
+
return s
|
| 88 |
+
|
| 89 |
+
def _join_measure_name(measure, name):
    """Combine a measure and an ingredient name into one display string.

    The measure goes through ``_norm_measure`` (which maps non-strings to
    ``""``); the name is stripped and, like the measure, treated as empty when
    it is not a string instead of raising ``AttributeError``. Returns
    ``"<measure> <name>"`` when both are present, otherwise whichever one is
    non-empty (or ``""``).
    """
    m = _norm_measure(measure)
    n = name.strip() if isinstance(name, str) else ""
    if m and n:
        return f"{m} {n}"
    return n or m
|
| 95 |
+
|
| 96 |
def _split_ingredient_blob(s):
|
| 97 |
+
"""Split a single string blob into lines. If it already contains amounts, keep them."""
|
| 98 |
if not isinstance(s, str):
|
| 99 |
return []
|
| 100 |
parts = re.split(r"[,\n;β’\-β]+", s)
|
| 101 |
+
# Keep original casing if looks like recipe lines; else lower()
|
| 102 |
+
out = []
|
| 103 |
+
for p in parts:
|
| 104 |
+
p = p.strip()
|
| 105 |
+
if not p:
|
| 106 |
+
continue
|
| 107 |
+
out.append(p)
|
| 108 |
+
return out
|
| 109 |
|
| 110 |
def _from_list_of_pairs(val):
|
| 111 |
+
"""[(measure, name), ...] or [(name, measure), ...] -> ['45 ml gin', ...]"""
|
| 112 |
+
out_disp, out_tokens = [], []
|
| 113 |
for x in val:
|
| 114 |
+
if not isinstance(x, (list, tuple)) or len(x) == 0:
|
| 115 |
+
continue
|
| 116 |
+
if len(x) == 1:
|
| 117 |
+
name = str(x[0]).strip()
|
| 118 |
+
if name:
|
| 119 |
+
out_disp.append(name)
|
| 120 |
+
out_tokens.append(name.lower())
|
| 121 |
+
continue
|
| 122 |
+
a, b = str(x[0]).strip(), str(x[1]).strip()
|
| 123 |
+
# Heuristic: the one containing digits is likely the measure
|
| 124 |
+
if re.search(r"\d", a) and not re.search(r"\d", b):
|
| 125 |
+
disp = _join_measure_name(a, b)
|
| 126 |
+
out_disp.append(disp)
|
| 127 |
+
out_tokens.append(b.lower())
|
| 128 |
+
elif re.search(r"\d", b) and not re.search(r"\d", a):
|
| 129 |
+
disp = _join_measure_name(b, a)
|
| 130 |
+
out_disp.append(disp)
|
| 131 |
+
out_tokens.append(a.lower())
|
| 132 |
+
else:
|
| 133 |
+
# ambiguous; just join
|
| 134 |
+
disp = (a + " " + b).strip()
|
| 135 |
+
out_disp.append(disp)
|
| 136 |
+
# token = the more "wordy" one
|
| 137 |
+
token = b if len(b) > len(a) else a
|
| 138 |
+
out_tokens.append(token.lower())
|
| 139 |
+
return out_disp, out_tokens
|
| 140 |
|
| 141 |
def _from_list_of_dicts(val):
|
| 142 |
+
"""
|
| 143 |
+
[{"name": "gin", "measure": "45 ml"}, {"ingredient":"lime juice","qty":"15 ml"}]
|
| 144 |
+
-> ['45 ml gin', '15 ml lime juice']
|
| 145 |
+
Tokens -> ['gin','lime juice']
|
| 146 |
+
"""
|
| 147 |
+
out_disp, out_tokens = [], []
|
| 148 |
for x in val:
|
| 149 |
+
if not isinstance(x, dict):
|
| 150 |
+
continue
|
| 151 |
+
name = None
|
| 152 |
+
for nk in ["name", "ingredient", "item", "raw", "text", "strIngredient"]:
|
| 153 |
+
if isinstance(x.get(nk), str) and x[nk].strip():
|
| 154 |
+
name = x[nk].strip()
|
| 155 |
+
break
|
| 156 |
+
meas = None
|
| 157 |
+
for mk in ["measure", "qty", "quantity", "amount", "unit", "Measure", "strMeasure"]:
|
| 158 |
+
if isinstance(x.get(mk), str) and x[mk].strip():
|
| 159 |
+
meas = x[mk].strip()
|
| 160 |
+
break
|
| 161 |
+
if name and meas:
|
| 162 |
+
out_disp.append(_join_measure_name(meas, name))
|
| 163 |
+
out_tokens.append(name.lower())
|
| 164 |
+
elif name:
|
| 165 |
+
out_disp.append(name)
|
| 166 |
+
out_tokens.append(name.lower())
|
| 167 |
+
return out_disp, out_tokens
|
| 168 |
+
|
| 169 |
+
def _ingredients_from_any(val):
|
| 170 |
+
"""
|
| 171 |
+
Return (display_list, token_list). Tries strings, list[str], list[pair], list[dict].
|
| 172 |
+
"""
|
| 173 |
+
# String blob
|
| 174 |
+
if isinstance(val, str):
|
| 175 |
+
lines = _split_ingredient_blob(val)
|
| 176 |
+
# tokens: strip measures if present (take trailing words)
|
| 177 |
+
tokens = []
|
| 178 |
+
for line in lines:
|
| 179 |
+
# crude split: drop leading quantities like "45 ml"
|
| 180 |
+
parts = re.split(r"\s+", line)
|
| 181 |
+
# find first token with letters
|
| 182 |
+
idx = 0
|
| 183 |
+
for i, p in enumerate(parts):
|
| 184 |
+
if re.search(r"[A-Za-z]", p):
|
| 185 |
+
idx = i
|
| 186 |
break
|
| 187 |
+
tokens.append(" ".join(parts[idx:]).lower())
|
| 188 |
+
return lines, tokens
|
| 189 |
+
|
| 190 |
+
# List of strings
|
| 191 |
+
if isinstance(val, list) and all(isinstance(x, str) for x in val):
|
| 192 |
+
disp = [x.strip() for x in val if x and x.strip()]
|
| 193 |
+
tokens = [x.lower().strip() for x in disp]
|
| 194 |
+
return disp, tokens
|
| 195 |
+
|
| 196 |
+
# List of pairs?
|
| 197 |
+
if isinstance(val, list) and any(isinstance(x, (list, tuple)) for x in val):
|
| 198 |
+
return _from_list_of_pairs(val)
|
| 199 |
+
|
| 200 |
+
# List of dicts?
|
| 201 |
+
if isinstance(val, list) and any(isinstance(x, dict) for x in val):
|
| 202 |
+
return _from_list_of_dicts(val)
|
| 203 |
+
|
| 204 |
+
return [], []
|
| 205 |
|
| 206 |
def _get_title(row, cols):
|
| 207 |
for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
|
|
|
|
| 209 |
return _clean(row[k])
|
| 210 |
return "Untitled"
|
| 211 |
|
| 212 |
+
def _get_ingredients_with_measures(row, cols):
|
| 213 |
+
"""
|
| 214 |
+
Returns (ingredients_display: list[str], ingredient_tokens: list[str])
|
| 215 |
+
"""
|
| 216 |
+
# 1) ingredient_tokens already present?
|
| 217 |
if "ingredient_tokens" in cols and row.get("ingredient_tokens"):
|
| 218 |
+
toks = [str(x).strip().lower() for x in row["ingredient_tokens"] if str(x).strip()]
|
| 219 |
+
# If measures also exist in a parallel field, combine
|
| 220 |
+
for mkey in ["measure_tokens","measures","measure_list"]:
|
| 221 |
+
if mkey in cols and row.get(mkey) and isinstance(row[mkey], list) and len(row[mkey]) == len(toks):
|
| 222 |
+
disp = []
|
| 223 |
+
for m, n in zip(row[mkey], row["ingredient_tokens"]):
|
| 224 |
+
m = _norm_measure(str(m))
|
| 225 |
+
n = str(n).strip()
|
| 226 |
+
disp.append(_join_measure_name(m, n) if m else n)
|
| 227 |
+
return disp, toks
|
| 228 |
+
# Otherwise just display tokens (no measures available)
|
| 229 |
+
return toks, toks
|
| 230 |
+
|
| 231 |
+
# 2) A combined "ingredients" field (string/list/pairs/dicts)
|
| 232 |
+
for key in ["ingredients","ingredients_raw","raw_ingredients","Raw_Ingredients","Raw Ingredients",
|
| 233 |
+
"ingredient_list","ingredients_list"]:
|
| 234 |
+
if key in cols and row.get(key) not in (None, "", [], {}):
|
| 235 |
+
return _ingredients_from_any(row[key])
|
| 236 |
+
|
| 237 |
+
return [], []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
def _get_glass(row, cols, title, ingredients_text):
|
| 240 |
# direct fields first
|
|
|
|
| 264 |
return tags
|
| 265 |
|
| 266 |
# ========================
|
| 267 |
+
# Load dataset & build docs
|
| 268 |
# ========================
|
| 269 |
ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
|
| 270 |
cols = ds.column_names
|
|
|
|
| 272 |
# One record per dataset row; "text" is the string that later gets embedded.
DOCS = []
for r in ds:
    title = _get_title(r, cols)
    ing_disp, ing_tokens = _get_ingredients_with_measures(r, cols)
    # Drop empty entries left over from extraction.
    ing_disp = [x for x in ing_disp if x]
    ing_tokens = [x for x in ing_tokens if x]

    # Prefer the display lines for glass inference; fall back to bare tokens.
    ing_txt_for_glass = ", ".join(ing_disp or ing_tokens)
    glass = _get_glass(r, cols, title, ing_txt_for_glass)

    # Text used for embeddings: title + tokens (no measures) + glass
    fused = f"{title}\nGlass: {glass}\nIngredients: {', '.join(ing_tokens)}"
    DOCS.append({
        "title": title,
        "glass": glass,
        "ingredients_display": ing_disp,   # with measures when available
        "ingredients_tokens": ing_tokens,  # names only (for search)
        "text": fused,
        "base": tag_base(fused),
        "flavors": tag_flavors(fused),
    })
|
| 293 |
|
| 294 |
# ========================
|
| 295 |
+
# Embeddings (title + glass + ingredient tokens)
|
| 296 |
# ========================
|
| 297 |
encoder = SentenceTransformer(EMBED_MODEL)
|
| 298 |
doc_embs = encoder.encode(
|
|
|
|
| 304 |
# ========================
|
| 305 |
# Recommendation (base filter + flavor boost)
|
| 306 |
# ========================
|
| 307 |
+
def _format_ingredients_markdown(lines):
|
| 308 |
+
if not lines:
|
| 309 |
+
return "β"
|
| 310 |
+
# bullet list for readability
|
| 311 |
+
return "\n".join([f"- {line}" for line in lines])
|
| 312 |
+
|
| 313 |
def recommend(base_alcohol, flavor, top_k=3):
|
| 314 |
if base_alcohol not in BASE_OPTIONS:
|
| 315 |
return "Please choose a base alcohol."
|
|
|
|
| 343 |
blocks = []
|
| 344 |
for sc, i in picks:
|
| 345 |
d = DOCS[i]
|
| 346 |
+
ing_lines = d["ingredients_display"] or d["ingredients_tokens"]
|
| 347 |
+
ing_md = _format_ingredients_markdown(ing_lines)
|
| 348 |
meta = f"**Base:** {d['base']} | **Flavor tags:** {', '.join(d['flavors']) or 'β'} | **Score:** {sc:.3f}"
|
| 349 |
blocks.append(
|
| 350 |
f"### {d['title']}\n"
|
| 351 |
f"**Glass:** {d['glass']} \n"
|
| 352 |
f"{meta}\n\n"
|
| 353 |
+
f"**Ingredients:**\n{ing_md}"
|
| 354 |
)
|
| 355 |
return "\n\n---\n\n".join(blocks)
|
| 356 |
|
|
|
|
| 358 |
# UI
|
| 359 |
# ========================
|
| 360 |
with gr.Blocks() as demo:
|
| 361 |
+
gr.Markdown("# πΉ AI Bartender β Base + Flavor (Ingredients with Measures + Glass)")
|
| 362 |
|
| 363 |
with gr.Row():
|
| 364 |
base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")
|