OGOGOG committed on
Commit
688bb5e
·
verified ·
1 Parent(s): 14b3df5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -62
app.py CHANGED
@@ -68,37 +68,140 @@ GLASS_HINTS = {
68
  }
69
 
70
  # ========================
71
- # Robust extraction helpers
72
  # ========================
73
  def _clean(s):
74
  return s.strip() if isinstance(s, str) else ""
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _split_ingredient_blob(s):
 
77
  if not isinstance(s, str):
78
  return []
79
  parts = re.split(r"[,\n;β€’\-–]+", s)
80
- return [p.strip().lower() for p in parts if p and len(p.strip()) > 1]
 
 
 
 
 
 
 
81
 
82
  def _from_list_of_pairs(val):
83
- out = []
 
84
  for x in val:
85
- if isinstance(x, (list, tuple)):
86
- cand = x[1] if len(x) > 1 else x[0]
87
- if isinstance(cand, str) and cand.strip():
88
- out.append(cand.strip().lower())
89
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  def _from_list_of_dicts(val):
92
- out = []
 
 
 
 
 
93
  for x in val:
94
- if isinstance(x, dict):
95
- # try several keys commonly used
96
- for k in ["name", "ingredient", "item", "raw", "text", "strIngredient"]:
97
- cand = x.get(k)
98
- if isinstance(cand, str) and cand.strip():
99
- out.append(cand.strip().lower())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  break
101
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def _get_title(row, cols):
104
  for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
@@ -106,43 +209,32 @@ def _get_title(row, cols):
106
  return _clean(row[k])
107
  return "Untitled"
108
 
109
- def _get_ingredients(row, cols):
110
- # 1) common tokenized field
 
 
 
111
  if "ingredient_tokens" in cols and row.get("ingredient_tokens"):
112
- vals = [str(x).strip().lower() for x in row["ingredient_tokens"] if str(x).strip()]
113
- if vals: return vals
114
-
115
- # 2) "ingredients" can be string, list of strings, list of pairs, list of dicts
116
- if "ingredients" in cols and row.get("ingredients") not in (None, "", [], {}):
117
- val = row["ingredients"]
118
- if isinstance(val, str):
119
- vals = _split_ingredient_blob(val)
120
- if vals: return vals
121
- elif isinstance(val, list):
122
- vals = [str(x).strip().lower() for x in val if isinstance(x, str) and x.strip()]
123
- if vals: return vals
124
- vals = _from_list_of_pairs(val)
125
- if vals: return vals
126
- vals = _from_list_of_dicts(val)
127
- if vals: return vals
128
-
129
- # 3) other possible field names
130
- for k in ["ingredients_raw","raw_ingredients","Raw_Ingredients","Raw Ingredients",
131
- "ingredient_list","ingredients_list"]:
132
- if k in cols and row.get(k) not in (None, "", [], {}):
133
- val = row[k]
134
- if isinstance(val, str):
135
- vals = _split_ingredient_blob(val)
136
- if vals: return vals
137
- elif isinstance(val, list):
138
- vals = [str(x).strip().lower() for x in val if isinstance(x, str) and x.strip()]
139
- if vals: return vals
140
- vals = _from_list_of_pairs(val)
141
- if vals: return vals
142
- vals = _from_list_of_dicts(val)
143
- if vals: return vals
144
-
145
- return []
146
 
147
  def _get_glass(row, cols, title, ingredients_text):
148
  # direct fields first
@@ -172,7 +264,7 @@ def tag_flavors(text):
172
  return tags
173
 
174
  # ========================
175
- # Load dataset and build docs
176
  # ========================
177
  ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
178
  cols = ds.column_names
@@ -180,21 +272,27 @@ cols = ds.column_names
180
  DOCS = []
181
  for r in ds:
182
  title = _get_title(r, cols)
183
- ingredients = _get_ingredients(r, cols)
184
- ing_txt = ", ".join(ingredients)
185
- glass = _get_glass(r, cols, title, ing_txt)
186
- fused = f"{title}\nGlass: {glass}\nIngredients: {ing_txt}"
 
 
 
 
 
187
  DOCS.append({
188
  "title": title,
189
  "glass": glass,
190
- "ingredients": ingredients,
 
191
  "text": fused,
192
  "base": tag_base(fused),
193
  "flavors": tag_flavors(fused),
194
  })
195
 
196
  # ========================
197
- # Embeddings (title + glass + ingredients)
198
  # ========================
199
  encoder = SentenceTransformer(EMBED_MODEL)
200
  doc_embs = encoder.encode(
@@ -206,6 +304,12 @@ doc_embs = encoder.encode(
206
  # ========================
207
  # Recommendation (base filter + flavor boost)
208
  # ========================
 
 
 
 
 
 
209
  def recommend(base_alcohol, flavor, top_k=3):
210
  if base_alcohol not in BASE_OPTIONS:
211
  return "Please choose a base alcohol."
@@ -239,13 +343,14 @@ def recommend(base_alcohol, flavor, top_k=3):
239
  blocks = []
240
  for sc, i in picks:
241
  d = DOCS[i]
242
- ing_txt = ", ".join(d["ingredients"]) if d["ingredients"] else "β€”"
 
243
  meta = f"**Base:** {d['base']} | **Flavor tags:** {', '.join(d['flavors']) or 'β€”'} | **Score:** {sc:.3f}"
244
  blocks.append(
245
  f"### {d['title']}\n"
246
  f"**Glass:** {d['glass']} \n"
247
  f"{meta}\n\n"
248
- f"**Ingredients:** {ing_txt}"
249
  )
250
  return "\n\n---\n\n".join(blocks)
251
 
@@ -253,7 +358,7 @@ def recommend(base_alcohol, flavor, top_k=3):
253
  # UI
254
  # ========================
255
  with gr.Blocks() as demo:
256
- gr.Markdown("# 🍹 Cocktail Recommender β€” Base + Flavor (Ingredients + Glass)")
257
 
258
  with gr.Row():
259
  base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")
 
68
  }
69
 
70
  # ========================
71
+ # Robust extraction helpers (with measures)
72
  # ========================
def _clean(s):
    """Return ``s`` without surrounding whitespace; non-strings yield ``""``."""
    if not isinstance(s, str):
        return ""
    return s.strip()
75
 
def _norm_measure(s: str) -> str:
    """Normalize whitespace, unit casing and number/unit spacing in a measure.

    Args:
        s: Raw measure text such as ``"45ML"`` or ``" 2   OZ "``.

    Returns:
        The cleaned measure (``"45 ml"``, ``"2 oz"``), or ``""`` when ``s`` is
        not a string.
    """
    if not isinstance(s, str):
        return ""
    s = re.sub(r"\s+", " ", s.strip())
    # A unit glued to its number ("45ml") has no word boundary between the
    # digit and the letters, so a plain \bml\b never sees it. This pass
    # handles units that follow a digit and leaves exactly one space between
    # the number and the unit.
    s = re.sub(
        r"(?<=\d)\s*(ml|oz|tsp|tbsp)\b",
        lambda m: " " + m.group(1).lower(),
        s,
        flags=re.I,
    )
    # Units that do not follow a digit (e.g. "half OZ") only need lowercasing.
    return re.sub(
        r"\b(ml|oz|tsp|tbsp)\b",
        lambda m: m.group(1).lower(),
        s,
        flags=re.I,
    )
88
+
def _join_measure_name(measure, name):
    """Combine a measure and an ingredient name into one display string.

    Args:
        measure: Raw measure text; it is cleaned with ``_norm_measure``.
        name: Ingredient name; anything that is not a string counts as empty.

    Returns:
        ``"<measure> <name>"`` when both parts are present, otherwise
        whichever part is non-empty (``""`` if neither is).
    """
    m = _norm_measure(measure)
    # Guard the name the same way _norm_measure guards the measure, so a
    # missing (None) name cannot raise AttributeError.
    n = name.strip() if isinstance(name, str) else ""
    if m and n:
        return f"{m} {n}"
    return n or m
95
+
def _split_ingredient_blob(s):
    """Split a free-text ingredient blob into one entry per ingredient.

    Entries are separated by commas, semicolons, newlines or bullet
    characters. A hyphen or en dash separates entries only when it stands
    alone (a list marker such as ``"- gin"`` or a spaced ``" - "``), so
    hyphenated names like ``"7-Up"`` stay intact. Amounts and the original
    casing are kept.

    Args:
        s: The ingredient text; anything that is not a string yields ``[]``.

    Returns:
        The non-empty entries, stripped of surrounding whitespace, in their
        original order.
    """
    if not isinstance(s, str):
        return []
    # \u2022 is the bullet and \u2013 the en dash. Writing them as escapes
    # keeps the pattern correct regardless of how this file is encoded.
    parts = re.split(r"[,\n;\u2022]+|\s+[-\u2013]+\s+", s)
    out = []
    for p in parts:
        # Remove dashes used as list markers at either edge ("- 45 ml gin").
        p = p.strip().strip("-\u2013").strip()
        if p:
            out.append(p)
    return out
109
 
def _from_list_of_pairs(val):
    """Turn ``(measure, name)`` / ``(name, measure)`` pairs into ingredient lists.

    The element that contains a digit is taken to be the measure. ``None``
    elements count as empty text, and entries with no text at all are skipped.

    Args:
        val: Iterable of lists/tuples. Items of any other type and empty
            sequences are ignored; elements past the second are not used.

    Returns:
        ``(display, tokens)``: ``display`` holds strings such as
        ``"45 ml gin"`` and ``tokens`` the matching lowercase names.
    """

    def _text(v):
        # str(None) would leak the literal "None" into the output.
        return "" if v is None else str(v).strip()

    def _has_digit(text):
        return re.search(r"\d", text) is not None

    out_disp, out_tokens = [], []
    for x in val:
        if not isinstance(x, (list, tuple)) or not x:
            continue
        a = _text(x[0])
        b = _text(x[1]) if len(x) > 1 else ""
        if not a or not b:
            # Only one usable element (or none): there is nothing to pair up.
            only = a or b
            if only:
                out_disp.append(only)
                out_tokens.append(only.lower())
            continue
        a_num, b_num = _has_digit(a), _has_digit(b)
        if a_num and not b_num:
            measure, name = a, b
        elif b_num and not a_num:
            measure, name = b, a
        else:
            # Ambiguous: keep the given order for display and use the longer
            # element as the name token.
            out_disp.append(f"{a} {b}")
            out_tokens.append((b if len(b) > len(a) else a).lower())
            continue
        out_disp.append(_join_measure_name(measure, name))
        out_tokens.append(name.lower())
    return out_disp, out_tokens
140
 
def _from_list_of_dicts(val):
    """Turn ingredient dicts into display strings and lowercase name tokens.

    Example::

        [{"name": "gin", "measure": "45 ml"},
         {"ingredient": "lime juice", "amount": 15, "unit": "ml"}]
        -> (['45 ml gin', '15 ml lime juice'], ['gin', 'lime juice'])

    The name comes from the first non-empty string among ``name``,
    ``ingredient``, ``item``, ``raw``, ``text`` and ``strIngredient``. The
    quantity comes from the first non-empty value among ``measure``, ``qty``,
    ``quantity``, ``amount``, ``Measure`` and ``strMeasure``; numbers are
    accepted as well as strings. A separate ``unit`` is appended to the
    quantity unless the quantity text already contains it, and is used alone
    when there is no quantity.

    Args:
        val: Iterable of dicts; items of any other type and dicts without a
            usable name are skipped.

    Returns:
        ``(display, tokens)`` with one entry per usable dict.
    """
    name_keys = ("name", "ingredient", "item", "raw", "text", "strIngredient")
    qty_keys = ("measure", "qty", "quantity", "amount", "Measure", "strMeasure")

    def _as_text(v):
        if isinstance(v, str):
            return v.strip()
        if isinstance(v, bool):
            # bool is an int subclass but is never a quantity.
            return ""
        if isinstance(v, int):
            return str(v)
        if isinstance(v, float) and v == v:  # v == v is False for NaN
            return f"{v:g}"
        return ""

    out_disp, out_tokens = [], []
    for x in val:
        if not isinstance(x, dict):
            continue
        name = ""
        for nk in name_keys:
            cand = x.get(nk)
            if isinstance(cand, str) and cand.strip():
                name = cand.strip()
                break
        if not name:
            continue
        qty = next(filter(None, (_as_text(x.get(mk)) for mk in qty_keys)), "")
        unit = _as_text(x.get("unit"))
        if qty and unit and unit.lower() not in qty.lower():
            meas = f"{qty} {unit}"
        else:
            meas = qty or unit
        out_disp.append(_join_measure_name(meas, name) if meas else name)
        out_tokens.append(name.lower())
    return out_disp, out_tokens
168
+
_MEASURE_UNITS = frozenset({
    "ml", "cl", "oz", "tsp", "tbsp", "cup", "cups", "dash", "dashes",
    "drop", "drops", "part", "parts", "shot", "shots", "splash", "pinch",
})


def _strip_leading_measure(line):
    """Return the lowercase ingredient name of ``line`` without its leading amount."""
    words = line.split()
    i = 0
    # Leading words without any letter are quantities ("45", "1/2").
    while i < len(words) and not re.search(r"[A-Za-z]", words[i]):
        i += 1
    if i < len(words):
        word = words[i].lower().rstrip(".")
        glued = re.fullmatch(r"[\d./]+([a-z]+)", word)  # e.g. "45ml"
        # A bare unit word counts only after a quantity, so a name that merely
        # starts with such a word is left alone.
        if (glued and glued.group(1) in _MEASURE_UNITS) or (
            i > 0 and word in _MEASURE_UNITS
        ):
            i += 1
            if i < len(words) and words[i].lower() == "of":
                i += 1
    # When the line is nothing but an amount, keep it whole rather than
    # returning an empty token.
    return " ".join(words[i:] or words).lower()


def _ingredients_from_any(val):
    """Extract ingredients from a string blob or a list of str/pairs/dicts.

    Args:
        val: A string (split with ``_split_ingredient_blob``), or a list of
            strings, of ``(measure, name)`` pairs, or of dicts. In a list of
            strings, non-string items such as ``None`` are ignored instead of
            discarding the whole list.

    Returns:
        ``(display_list, token_list)``. Display entries keep amounts and
        casing; tokens are lowercase names with leading amounts and units
        removed. Any other input gives ``([], [])``.
    """
    if isinstance(val, str):
        lines = _split_ingredient_blob(val)
        return lines, [_strip_leading_measure(line) for line in lines]

    if not isinstance(val, list):
        return [], []

    if any(isinstance(x, (list, tuple)) for x in val):
        return _from_list_of_pairs(val)

    if any(isinstance(x, dict) for x in val):
        return _from_list_of_dicts(val)

    disp = [x.strip() for x in val if isinstance(x, str) and x.strip()]
    return disp, [_strip_leading_measure(x) for x in disp]
205
 
206
  def _get_title(row, cols):
207
  for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
 
209
  return _clean(row[k])
210
  return "Untitled"
211
 
def _get_ingredients_with_measures(row, cols):
    """Extract a row's ingredients as display lines and lowercase name tokens.

    Args:
        row: One dataset record, accessed like a dict.
        cols: Column names of the dataset; a field is read only if listed here.

    Returns:
        ``(ingredients_display, ingredient_tokens)``. Display lines carry the
        measure when a parallel measures column of matching length exists.
        Both lists are empty when no ingredient field yields anything.
    """
    # 1) Pre-tokenized names, optionally with a parallel measures column.
    if "ingredient_tokens" in cols and row.get("ingredient_tokens"):
        names = [str(x).strip() for x in row["ingredient_tokens"]]
        toks = [n.lower() for n in names if n]
        for mkey in ("measure_tokens", "measures", "measure_list"):
            measures = row.get(mkey) if mkey in cols else None
            # The measures run parallel to the *unfiltered* names, so compare
            # and pair against that list; blank names are dropped afterwards
            # to keep display and tokens aligned.
            if isinstance(measures, list) and measures and len(measures) == len(names):
                disp = [
                    _join_measure_name("" if m is None else str(m), n)
                    for m, n in zip(measures, names)
                    if n
                ]
                return disp, toks
        if toks:
            # No measures available: show the tokens themselves.
            return list(toks), toks

    # 2) A combined ingredients field (string / list / pairs / dicts). Keep
    #    looking when a field is present but nothing can be parsed from it.
    for key in ("ingredients", "ingredients_raw", "raw_ingredients",
                "Raw_Ingredients", "Raw Ingredients",
                "ingredient_list", "ingredients_list"):
        if key in cols and row.get(key) not in (None, "", [], {}):
            disp, toks = _ingredients_from_any(row[key])
            if disp or toks:
                return disp, toks

    return [], []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  def _get_glass(row, cols, title, ingredients_text):
240
  # direct fields first
 
264
  return tags
265
 
266
  # ========================
267
+ # Load dataset & build docs
268
  # ========================
269
  ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
270
  cols = ds.column_names
 
# Build one record per dataset row: display fields plus the text to embed.
DOCS = []
for r in ds:
    title = _get_title(r, cols)
    ing_disp, ing_tokens = _get_ingredients_with_measures(r, cols)
    # Drop empty entries from both lists.
    ing_disp = list(filter(None, ing_disp))
    ing_tokens = list(filter(None, ing_tokens))

    # _get_glass receives the display lines, or the tokens when there are none.
    ing_txt_for_glass = ", ".join(ing_disp or ing_tokens)
    glass = _get_glass(r, cols, title, ing_txt_for_glass)

    # Text used for embeddings: title + glass + ingredient tokens (no measures).
    fused = f"{title}\nGlass: {glass}\nIngredients: {', '.join(ing_tokens)}"
    DOCS.append(
        dict(
            title=title,
            glass=glass,
            ingredients_display=ing_disp,  # with measures when available
            ingredients_tokens=ing_tokens,  # names only (for search)
            text=fused,
            base=tag_base(fused),
            flavors=tag_flavors(fused),
        )
    )
293
 
294
  # ========================
295
+ # Embeddings (title + glass + ingredient tokens)
296
  # ========================
297
  encoder = SentenceTransformer(EMBED_MODEL)
298
  doc_embs = encoder.encode(
 
304
  # ========================
305
  # Recommendation (base filter + flavor boost)
306
  # ========================
def _format_ingredients_markdown(lines):
    """Render ingredient lines as a Markdown bullet list.

    Args:
        lines: Display strings, one per ingredient.

    Returns:
        One ``"- <line>"`` row per entry joined by newlines, or an em dash
        placeholder when ``lines`` is empty.
    """
    if not lines:
        # Written as an escape so the placeholder cannot be turned into
        # mojibake by a wrong file encoding.
        return "\u2014"
    return "\n".join(f"- {line}" for line in lines)
312
+
313
  def recommend(base_alcohol, flavor, top_k=3):
314
  if base_alcohol not in BASE_OPTIONS:
315
  return "Please choose a base alcohol."
 
343
  blocks = []
344
  for sc, i in picks:
345
  d = DOCS[i]
346
+ ing_lines = d["ingredients_display"] or d["ingredients_tokens"]
347
+ ing_md = _format_ingredients_markdown(ing_lines)
348
  meta = f"**Base:** {d['base']} | **Flavor tags:** {', '.join(d['flavors']) or 'β€”'} | **Score:** {sc:.3f}"
349
  blocks.append(
350
  f"### {d['title']}\n"
351
  f"**Glass:** {d['glass']} \n"
352
  f"{meta}\n\n"
353
+ f"**Ingredients:**\n{ing_md}"
354
  )
355
  return "\n\n---\n\n".join(blocks)
356
 
 
358
  # UI
359
  # ========================
360
  with gr.Blocks() as demo:
361
+ gr.Markdown("# 🍹 AI Bartender β€” Base + Flavor (Ingredients with Measures + Glass)")
362
 
363
  with gr.Row():
364
  base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")