Spaces:

OGOGOG
/

Barternder-draft

Sleeping

App Files Files Community

OGOGOG committed on Aug 9

Commit

688bb5e

verified ·

1 Parent(s): 14b3df5

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -62

app.py CHANGED Viewed

@@ -68,37 +68,140 @@ GLASS_HINTS = {
 }
 # ========================
-# Robust extraction helpers
 # ========================
 def _clean(s):
     return s.strip() if isinstance(s, str) else ""
 def _split_ingredient_blob(s):
     if not isinstance(s, str):
         return []
     parts = re.split(r"[,\n;•\-–]+", s)
-    return [p.strip().lower() for p in parts if p and len(p.strip()) > 1]
 def _from_list_of_pairs(val):
-    out = []
     for x in val:
-        if isinstance(x, (list, tuple)):
-            cand = x[1] if len(x) > 1 else x[0]
-            if isinstance(cand, str) and cand.strip():
-                out.append(cand.strip().lower())
-    return out
 def _from_list_of_dicts(val):
-    out = []
     for x in val:
-        if isinstance(x, dict):
-            # try several keys commonly used
-            for k in ["name", "ingredient", "item", "raw", "text", "strIngredient"]:
-                cand = x.get(k)
-                if isinstance(cand, str) and cand.strip():
-                    out.append(cand.strip().lower())
                     break
-    return out
 def _get_title(row, cols):
     for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
@@ -106,43 +209,32 @@ def _get_title(row, cols):
             return _clean(row[k])
     return "Untitled"
-def _get_ingredients(row, cols):
-    # 1) common tokenized field
     if "ingredient_tokens" in cols and row.get("ingredient_tokens"):
-        vals = [str(x).strip().lower() for x in row["ingredient_tokens"] if str(x).strip()]
-        if vals: return vals
-    # 2) "ingredients" can be string, list of strings, list of pairs, list of dicts
-    if "ingredients" in cols and row.get("ingredients") not in (None, "", [], {}):
-        val = row["ingredients"]
-        if isinstance(val, str):
-            vals = _split_ingredient_blob(val)
-            if vals: return vals
-        elif isinstance(val, list):
-            vals = [str(x).strip().lower() for x in val if isinstance(x, str) and x.strip()]
-            if vals: return vals
-            vals = _from_list_of_pairs(val)
-            if vals: return vals
-            vals = _from_list_of_dicts(val)
-            if vals: return vals
-    # 3) other possible field names
-    for k in ["ingredients_raw","raw_ingredients","Raw_Ingredients","Raw Ingredients",
-              "ingredient_list","ingredients_list"]:
-        if k in cols and row.get(k) not in (None, "", [], {}):
-            val = row[k]
-            if isinstance(val, str):
-                vals = _split_ingredient_blob(val)
-                if vals: return vals
-            elif isinstance(val, list):
-                vals = [str(x).strip().lower() for x in val if isinstance(x, str) and x.strip()]
-                if vals: return vals
-                vals = _from_list_of_pairs(val)
-                if vals: return vals
-                vals = _from_list_of_dicts(val)
-                if vals: return vals
-    return []
 def _get_glass(row, cols, title, ingredients_text):
     # direct fields first
@@ -172,7 +264,7 @@ def tag_flavors(text):
     return tags
 # ========================
-# Load dataset and build docs
 # ========================
 ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
 cols = ds.column_names
@@ -180,21 +272,27 @@ cols = ds.column_names
 DOCS = []
 for r in ds:
     title = _get_title(r, cols)
-    ingredients = _get_ingredients(r, cols)
-    ing_txt = ", ".join(ingredients)
-    glass = _get_glass(r, cols, title, ing_txt)
-    fused = f"{title}\nGlass: {glass}\nIngredients: {ing_txt}"
     DOCS.append({
         "title": title,
         "glass": glass,
-        "ingredients": ingredients,
         "text": fused,
         "base": tag_base(fused),
         "flavors": tag_flavors(fused),
     })
 # ========================
-# Embeddings (title + glass + ingredients)
 # ========================
 encoder = SentenceTransformer(EMBED_MODEL)
 doc_embs = encoder.encode(
@@ -206,6 +304,12 @@ doc_embs = encoder.encode(
 # ========================
 # Recommendation (base filter + flavor boost)
 # ========================
 def recommend(base_alcohol, flavor, top_k=3):
     if base_alcohol not in BASE_OPTIONS:
         return "Please choose a base alcohol."
@@ -239,13 +343,14 @@ def recommend(base_alcohol, flavor, top_k=3):
     blocks = []
     for sc, i in picks:
         d = DOCS[i]
-        ing_txt = ", ".join(d["ingredients"]) if d["ingredients"] else "—"
         meta = f"**Base:** {d['base']}  |  **Flavor tags:** {', '.join(d['flavors']) or '—'}  |  **Score:** {sc:.3f}"
         blocks.append(
             f"### {d['title']}\n"
             f"**Glass:** {d['glass']}  \n"
             f"{meta}\n\n"
-            f"**Ingredients:** {ing_txt}"
         )
     return "\n\n---\n\n".join(blocks)
@@ -253,7 +358,7 @@ def recommend(base_alcohol, flavor, top_k=3):
 # UI
 # ========================
 with gr.Blocks() as demo:
-    gr.Markdown("# 🍹 Cocktail Recommender — Base + Flavor (Ingredients + Glass)")
     with gr.Row():
         base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")

 }
 # ========================
+# Robust extraction helpers (with measures)
 # ========================
 def _clean(s):
     return s.strip() if isinstance(s, str) else ""
def _norm_measure(s: str) -> str:
    """Normalize whitespace and unit casing in a measure string.

    Leading/trailing whitespace is removed, runs of internal whitespace are
    collapsed to a single space, and the units ``ml``, ``oz``, ``tsp`` and
    ``tbsp`` are lower-cased. A unit is recognised when it stands alone or is
    glued to a number (``"45ML"`` -> ``"45ml"``), but not when it is the tail
    of a longer word.

    Args:
        s: Raw measure text; any non-string value is treated as missing.

    Returns:
        The normalized measure, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""
    s = re.sub(r"\s+", " ", s.strip())
    # The lookbehind rejects a preceding letter/underscore but allows a digit,
    # so "45ML" is normalized while a word such as "HTML" is left untouched.
    return re.sub(
        r"(?<![^\W\d])(?:ml|oz|tsp|tbsp)\b",
        lambda m: m.group(0).lower(),
        s,
        flags=re.I,
    )
def _join_measure_name(measure, name):
    """Combine a measure and an ingredient name into one display string.

    Args:
        measure: Raw measure text (e.g. ``"45 ML"``). It is passed through
            ``_norm_measure``, which maps non-strings to ``""``.
        name: Ingredient name. A non-string value is treated as missing
            rather than raising ``AttributeError``.

    Returns:
        ``"<measure> <name>"`` when both parts are non-empty, otherwise
        whichever part is non-empty, or ``""`` when both are missing.
    """
    m = _norm_measure(measure)
    n = name.strip() if isinstance(name, str) else ""
    if m and n:
        return f"{m} {n}"
    return n or m
 def _split_ingredient_blob(s):
+    """Split a single string blob into lines. If it already contains amounts, keep them."""
     if not isinstance(s, str):
         return []
     parts = re.split(r"[,\n;•\-–]+", s)
+    # Keep original casing if looks like recipe lines; else lower()
+    out = []
+    for p in parts:
+        p = p.strip()
+        if not p:
+            continue
+        out.append(p)
+    return out
 def _from_list_of_pairs(val):
+    """[(measure, name), ...] or [(name, measure), ...] -> ['45 ml gin', ...]"""
+    out_disp, out_tokens = [], []
     for x in val:
+        if not isinstance(x, (list, tuple)) or len(x) == 0:
+            continue
+        if len(x) == 1:
+            name = str(x[0]).strip()
+            if name:
+                out_disp.append(name)
+                out_tokens.append(name.lower())
+            continue
+        a, b = str(x[0]).strip(), str(x[1]).strip()
+        # Heuristic: the one containing digits is likely the measure
+        if re.search(r"\d", a) and not re.search(r"\d", b):
+            disp = _join_measure_name(a, b)
+            out_disp.append(disp)
+            out_tokens.append(b.lower())
+        elif re.search(r"\d", b) and not re.search(r"\d", a):
+            disp = _join_measure_name(b, a)
+            out_disp.append(disp)
+            out_tokens.append(a.lower())
+        else:
+            # ambiguous; just join
+            disp = (a + " " + b).strip()
+            out_disp.append(disp)
+            # token = the more "wordy" one
+            token = b if len(b) > len(a) else a
+            out_tokens.append(token.lower())
+    return out_disp, out_tokens
 def _from_list_of_dicts(val):
+    """
+    [{"name": "gin", "measure": "45 ml"}, {"ingredient":"lime juice","qty":"15 ml"}]
+    -> ['45 ml gin', '15 ml lime juice']
+    Tokens -> ['gin','lime juice']
+    """
+    out_disp, out_tokens = [], []
     for x in val:
+        if not isinstance(x, dict):
+            continue
+        name = None
+        for nk in ["name", "ingredient", "item", "raw", "text", "strIngredient"]:
+            if isinstance(x.get(nk), str) and x[nk].strip():
+                name = x[nk].strip()
+                break
+        meas = None
+        for mk in ["measure", "qty", "quantity", "amount", "unit", "Measure", "strMeasure"]:
+            if isinstance(x.get(mk), str) and x[mk].strip():
+                meas = x[mk].strip()
+                break
+        if name and meas:
+            out_disp.append(_join_measure_name(meas, name))
+            out_tokens.append(name.lower())
+        elif name:
+            out_disp.append(name)
+            out_tokens.append(name.lower())
+    return out_disp, out_tokens
# One numeric quantity: "45", "1.5", "1/2" or a single vulgar-fraction glyph.
_QTY_PATTERN = r"(?:\d+(?:[.,/]\d+)?|[¼½¾⅓⅔⅛])"
# Units that may follow a quantity (optionally abbreviated with a dot).
_UNIT_PATTERN = (
    r"(?:ml|cl|oz|ounces?|tsp|teaspoons?|tbsp|tablespoons?|cups?|parts?|shots?"
    r"|dash(?:es)?|drops?|splash(?:es)?|pinch(?:es)?)\b\.?"
)
# Leading measure: quantity sequence ("1 1/2", "1-2") followed by a unit or
# by whitespace (bare count, "2 eggs"), or a unit alone ("dash of bitters").
# Requiring a unit/whitespace after the number keeps names like "7-Up" whole.
_LEADING_MEASURE_RE = re.compile(
    r"\s*(?:"
    + _QTY_PATTERN + r"(?:\s*[-–]?\s*" + _QTY_PATTERN + r")*"
    + r"(?:\s*" + _UNIT_PATTERN + r"|(?=\s))"
    + r"|" + _UNIT_PATTERN + r"(?=\s)"
    + r")\s*(?:of\s+)?",
    re.IGNORECASE,
)


def _strip_leading_measure(line):
    """Return ``line`` lower-cased with any leading quantity/unit removed."""
    m = _LEADING_MEASURE_RE.match(line)
    rest = line[m.end():].strip() if m else ""
    # Fall back to the whole line when it held nothing but a measure.
    return (rest or line.strip()).lower()


def _ingredients_from_any(val):
    """Parse an ingredients value of unknown shape.

    Supported shapes, tried in this order: a single string blob, a list of
    strings, a list containing pairs (list/tuple), a list containing dicts.

    Args:
        val: The raw field value.

    Returns:
        ``(display_list, token_list)``. Display lines keep their original
        text (including measures). For string inputs the tokens are the
        lower-cased lines with a leading quantity and unit removed, e.g.
        ``"45 ml Gin"`` -> ``"gin"``. Unsupported shapes yield ``([], [])``.
    """
    # String blob
    if isinstance(val, str):
        lines = _split_ingredient_blob(val)
        return lines, [_strip_leading_measure(line) for line in lines]
    if not isinstance(val, list):
        return [], []
    # List of strings (an empty list also lands here and yields two empty lists)
    if all(isinstance(x, str) for x in val):
        disp = [x.strip() for x in val if x.strip()]
        return disp, [_strip_leading_measure(x) for x in disp]
    # List of pairs?
    if any(isinstance(x, (list, tuple)) for x in val):
        return _from_list_of_pairs(val)
    # List of dicts?
    if any(isinstance(x, dict) for x in val):
        return _from_list_of_dicts(val)
    return [], []
 def _get_title(row, cols):
     for k in ["title","name","cocktail_name","drink","Drink","strDrink"]:
             return _clean(row[k])
     return "Untitled"
def _get_ingredients_with_measures(row, cols):
    """Extract ingredients from a dataset row, with measures when available.

    Lookup order:

    1. ``ingredient_tokens``, combined with a parallel ``measure_tokens`` /
       ``measures`` / ``measure_list`` list when one of the same length
       exists.
    2. The first of ``ingredients``, ``ingredients_raw``, ``raw_ingredients``,
       ``Raw_Ingredients``, ``Raw Ingredients``, ``ingredient_list``,
       ``ingredients_list`` that parses to at least one ingredient.

    Args:
        row: Mapping-like row supporting ``.get`` and item access.
        cols: Column names present in the dataset.

    Returns:
        ``(ingredients_display, ingredient_tokens)`` -- display lines (with
        measures where known) and lower-cased names; ``([], [])`` when
        nothing usable is found.
    """
    def _text(v):
        # None must not be rendered as the literal string "None".
        return "" if v is None else str(v).strip()

    # 1) ingredient_tokens already present?
    raw_tokens = row.get("ingredient_tokens") if "ingredient_tokens" in cols else None
    if isinstance(raw_tokens, str):
        # A bare string would otherwise be iterated character by character.
        disp, toks = _ingredients_from_any(raw_tokens)
        if disp or toks:
            return disp, toks
    elif raw_tokens:
        names = [_text(x) for x in raw_tokens]
        toks = [n.lower() for n in names if n]
        if toks:
            for mkey in ("measure_tokens", "measures", "measure_list"):
                measures = row.get(mkey) if mkey in cols else None
                # Compare against the unfiltered names so that measure i is
                # always paired with token i.
                if isinstance(measures, list) and measures and len(measures) == len(names):
                    disp = [
                        _join_measure_name(_text(m), n)
                        for m, n in zip(measures, names)
                        if n
                    ]
                    return disp, toks
            # No measures available: display the tokens themselves.
            return list(toks), toks
    # 2) A combined "ingredients" field (string/list/pairs/dicts). Keep
    #    looking when a field is present but parses to nothing.
    for key in ("ingredients", "ingredients_raw", "raw_ingredients", "Raw_Ingredients",
                "Raw Ingredients", "ingredient_list", "ingredients_list"):
        if key in cols and row.get(key) not in (None, "", [], {}):
            disp, toks = _ingredients_from_any(row[key])
            if disp or toks:
                return disp, toks
    return [], []
 def _get_glass(row, cols, title, ingredients_text):
     # direct fields first
     return tags
 # ========================
+# Load dataset & build docs
 # ========================
 ds = load_dataset(DATASET_ID, split="train", **load_kwargs)
 cols = ds.column_names
 DOCS = []
 for r in ds:
     title = _get_title(r, cols)
+    ing_disp, ing_tokens = _get_ingredients_with_measures(r, cols)
+    ing_disp = [x for x in ing_disp if x]  # clean empties
+    ing_tokens = [x for x in ing_tokens if x]
+    ing_txt_for_glass = ", ".join(ing_disp) if ing_disp else ", ".join(ing_tokens)
+    glass = _get_glass(r, cols, title, ing_txt_for_glass)
+    # Text used for embeddings: title + tokens (no measures) + glass
+    fused = f"{title}\nGlass: {glass}\nIngredients: {', '.join(ing_tokens)}"
     DOCS.append({
         "title": title,
         "glass": glass,
+        "ingredients_display": ing_disp,   # with measures when available
+        "ingredients_tokens": ing_tokens,  # names only (for search)
         "text": fused,
         "base": tag_base(fused),
         "flavors": tag_flavors(fused),
     })
 # ========================
+# Embeddings (title + glass + ingredient tokens)
 # ========================
 encoder = SentenceTransformer(EMBED_MODEL)
 doc_embs = encoder.encode(
 # ========================
 # Recommendation (base filter + flavor boost)
 # ========================
def _format_ingredients_markdown(lines):
    """Render ingredient lines as a Markdown bullet list.

    Each entry becomes one ``- <entry>`` line. An empty or missing ``lines``
    value yields the em dash placeholder instead.
    """
    if lines:
        bullets = (f"- {entry}" for entry in lines)
        return "\n".join(bullets)
    return "—"
 def recommend(base_alcohol, flavor, top_k=3):
     if base_alcohol not in BASE_OPTIONS:
         return "Please choose a base alcohol."
     blocks = []
     for sc, i in picks:
         d = DOCS[i]
+        ing_lines = d["ingredients_display"] or d["ingredients_tokens"]
+        ing_md = _format_ingredients_markdown(ing_lines)
         meta = f"**Base:** {d['base']}  |  **Flavor tags:** {', '.join(d['flavors']) or '—'}  |  **Score:** {sc:.3f}"
         blocks.append(
             f"### {d['title']}\n"
             f"**Glass:** {d['glass']}  \n"
             f"{meta}\n\n"
+            f"**Ingredients:**\n{ing_md}"
         )
     return "\n\n---\n\n".join(blocks)
 # UI
 # ========================
 with gr.Blocks() as demo:
+    gr.Markdown("# 🍹 AI Bartender — Base + Flavor (Ingredients with Measures + Glass)")
     with gr.Row():
         base = gr.Dropdown(choices=BASE_OPTIONS, value="gin", label="Base alcohol")