Update app.py (#5)
Browse files- Update app.py (d03a66cdb5555dafa34555e5f7802e5c03892b1a)
Co-authored-by: Aslan Noorghasemi <[email protected]>
app.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
# Import dependencies
|
| 2 |
-
|
| 3 |
from langchain.prompts import PromptTemplate
|
| 4 |
from langchain.chains import LLMChain
|
| 5 |
from pyvis.network import Network
|
|
@@ -33,17 +31,14 @@ from peft import PeftModel, LoraConfig, get_peft_model, TaskType
|
|
| 33 |
|
| 34 |
# Setup
|
| 35 |
|
| 36 |
-
#token_public = ""
|
| 37 |
-
#login(token)
|
| 38 |
-
|
| 39 |
REPO_ID_NEAR_FIELD_RAW = "milistu/AMAZON-Products-2023"
|
| 40 |
REPO_ID_NEAR_FIELD = "aslan-ng/amazon_products_2023"
|
| 41 |
REPO_ID_FAR_FIELD = "aslan-ng/amazon_products_2025"
|
|
|
|
| 42 |
|
| 43 |
def product_quality_score(average_rating: float, rating_number: int):
|
| 44 |
"""
|
| 45 |
Bayesian Average (Amazon-style)
|
| 46 |
-
|
| 47 |
Args:
|
| 48 |
average_rating: product's average rating
|
| 49 |
rating_number: number of reviews
|
|
@@ -54,10 +49,6 @@ def product_quality_score(average_rating: float, rating_number: int):
|
|
| 54 |
return C # fallback to global mean
|
| 55 |
return (rating_number / (rating_number + m)) * average_rating + (m / (rating_number + m)) * C
|
| 56 |
|
| 57 |
-
# Example
|
| 58 |
-
print("Product 1: ", product_quality_score(average_rating=4.25, rating_number=10000))
|
| 59 |
-
print("Product 2: ", product_quality_score(average_rating=5.0, rating_number=1))
|
| 60 |
-
|
| 61 |
def load_near_field_raw_from_huggingface():
|
| 62 |
"""
|
| 63 |
Load the raw near-field dataset from HuggingFace.
|
|
@@ -119,53 +110,7 @@ def load_near_field_from_huggingface():
|
|
| 119 |
ds = load_dataset(REPO_ID_NEAR_FIELD, split="train")
|
| 120 |
return ds.to_pandas()
|
| 121 |
|
| 122 |
-
def save_near_field_to_huggingface():
    """
    Push the near-field dataset to the HuggingFace Hub.

    Loads the raw near-field data with load_near_field_raw_from_huggingface(),
    converts it to a `Dataset`, and uploads it to the REPO_ID_NEAR_FIELD
    repository. This is a one-off publishing step, not part of normal
    application start-up.

    Returns:
        None. The only effects are the upload and a confirmation message
        printed to stdout.
    """
    df = load_near_field_raw_from_huggingface()
    ds = Dataset.from_pandas(df)
    ds.push_to_hub(REPO_ID_NEAR_FIELD)
    print(f"✅ Pushed {len(ds)} rows to {REPO_ID_NEAR_FIELD}")
|
| 130 |
-
|
| 131 |
-
#save_near_field_to_huggingface() # Run it once
|
| 132 |
dataset_near_field = load_near_field_from_huggingface()
|
| 133 |
-
print("Near-Field Length: ", len(dataset_near_field))
|
| 134 |
-
#print(dataset_near_field.head())
|
| 135 |
-
|
| 136 |
-
def load_far_field_from_sheet():
    """
    Load the far-field dataset from Google Sheets.

    Authenticates the current user, opens the spreadsheet identified by
    SHEET_ID_FAR_FIELD, and stacks every non-empty worksheet (tab) into one
    DataFrame. The worksheet title is stored in the `main_category` column,
    and a `product_quality_score` column is computed for each row.

    Returns:
        pandas.DataFrame with the columns `title`, `description`,
        `average_rating`, `rating_number`, `main_category` and
        `product_quality_score`. If no worksheet contains any rows, the
        frame is empty but still carries these columns.

    Raises:
        KeyError: if a non-empty worksheet lacks one of the required columns.
    """
    # Imported locally, as in the original: only this loader needs it.
    from google.auth import default

    COLS = ["title", "description", "average_rating", "rating_number"]

    auth.authenticate_user()
    sh = gspread.authorize(default()[0]).open_by_key(SHEET_ID_FAR_FIELD)

    frames = []
    for ws in sh.worksheets():  # iterate ALL sheets/tabs
        rows = ws.get_all_records()
        if not rows:
            continue
        df = pd.DataFrame(rows)

        # Fail with the offending tab's name instead of an opaque pandas error.
        missing = [col for col in COLS if col not in df.columns]
        if missing:
            raise KeyError(
                f"Worksheet {ws.title!r} is missing required columns: {missing}"
            )

        # Keep only the required columns and tag rows with the tab name.
        df = df[COLS].copy()
        df["main_category"] = ws.title
        frames.append(df)

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=COLS + ["main_category"])

    def _safe_pqs(row):
        # Score only rows where both rating fields are present; otherwise NaN.
        ar, n = row["average_rating"], row["rating_number"]
        if pd.notna(ar) and pd.notna(n):
            return product_quality_score(ar, n)
        return float("nan")

    df["product_quality_score"] = df.apply(_safe_pqs, axis=1)

    return df
|
| 169 |
|
| 170 |
def load_far_field_from_huggingface():
|
| 171 |
"""
|
|
@@ -174,19 +119,7 @@ def load_far_field_from_huggingface():
|
|
| 174 |
ds = load_dataset(REPO_ID_FAR_FIELD, split="train")
|
| 175 |
return ds.to_pandas()
|
| 176 |
|
| 177 |
-
def save_far_field_to_huggingface():
    """
    Push the far-field dataset to the HuggingFace Hub.

    Loads the far-field data from Google Sheets with
    load_far_field_from_sheet(), converts it to a `Dataset`, and uploads it
    to the REPO_ID_FAR_FIELD repository. This is a one-off publishing step,
    not part of normal application start-up.

    Returns:
        None. The only effects are the upload and a confirmation message
        printed to stdout.
    """
    df = load_far_field_from_sheet()
    ds = Dataset.from_pandas(df)
    ds.push_to_hub(REPO_ID_FAR_FIELD)
    print(f"✅ Pushed {len(ds)} rows to {REPO_ID_FAR_FIELD}")
|
| 185 |
-
|
| 186 |
-
#save_far_field_to_huggingface() # Run it once
|
| 187 |
dataset_far_field = load_far_field_from_huggingface()
|
| 188 |
-
print("Far-Field Length: ",len(dataset_far_field))
|
| 189 |
-
#print(dataset_far_field.head())
|
| 190 |
|
| 191 |
def product_score(product_quality_score: float, fuzzy_score: float):
|
| 192 |
"""
|
|
@@ -194,10 +127,6 @@ def product_score(product_quality_score: float, fuzzy_score: float):
|
|
| 194 |
"""
|
| 195 |
return math.sqrt(product_quality_score * fuzzy_score)
|
| 196 |
|
| 197 |
-
# Example
|
| 198 |
-
print("Product 1: ", product_score(product_quality_score=3.2, fuzzy_score=100))
|
| 199 |
-
print("Product 2: ", product_score(product_quality_score=4.5, fuzzy_score=70))
|
| 200 |
-
|
| 201 |
def query_near_field(input: str, top_k: int=1):
|
| 202 |
"""
|
| 203 |
Return top_k fuzzy matches for query against dataset titles as a pandas DataFrame.
|
|
@@ -234,11 +163,6 @@ def query_near_field(input: str, top_k: int=1):
|
|
| 234 |
.reset_index(drop=True)
|
| 235 |
)
|
| 236 |
|
| 237 |
-
# Example
|
| 238 |
-
near_field_result = query_near_field("water bottle", top_k=5)
|
| 239 |
-
#print(near_field_result.head())
|
| 240 |
-
print("Example: ", near_field_result.iloc[0]["title"])
|
| 241 |
-
|
| 242 |
def query_far_field(input: str, top_k: int):
|
| 243 |
"""
|
| 244 |
Return top_k random elements from the far_field dataset as a pandas DataFrame.
|
|
@@ -267,12 +191,6 @@ def query_far_field(input: str, top_k: int):
|
|
| 267 |
|
| 268 |
return sampled
|
| 269 |
|
| 270 |
-
# Example usage
|
| 271 |
-
far_field_result = query_far_field("water bottle", top_k=3)
|
| 272 |
-
#print(far_field_result)
|
| 273 |
-
print("Top result title:", far_field_result.iloc[0]["title"])
|
| 274 |
-
#print("Top result title:", far_field_result)
|
| 275 |
-
|
| 276 |
def split_near_and_far_fields(total_examples: int, near_far_ratio: float = 0.5):
|
| 277 |
"""
|
| 278 |
Split the examples between near and far field.
|
|
@@ -290,9 +208,6 @@ def split_near_and_far_fields(total_examples: int, near_far_ratio: float = 0.5):
|
|
| 290 |
|
| 291 |
return near_field_examples, far_field_examples
|
| 292 |
|
| 293 |
-
# Example
|
| 294 |
-
print("Example: ", split_near_and_far_fields(total_examples=100, near_far_ratio=0.3)) # Expected: (30, 70)
|
| 295 |
-
|
| 296 |
def query(input: str, total_examples: int, near_far_ratio: float = 0.5):
|
| 297 |
near_field_examples, far_field_examples = split_near_and_far_fields(total_examples, near_far_ratio)
|
| 298 |
far_field_result = query_far_field(input, far_field_examples)
|
|
@@ -345,9 +260,7 @@ You are a product analyst. You'll receive product description as input, and extr
|
|
| 345 |
Product functionality refers to what the product does: its features, technical capabilities, and performance characteristics. It answers the question: “What can this product do?”
|
| 346 |
Product value refers to the benefit the customer gains from using the product: how it improves their life, solves their problem, or helps them achieve goals. It answers the question: “Why does this matter to the customer?”
|
| 347 |
Do **not** duplicate an item in both lists. Keep **functionalities** as concrete features. Keep **values** as clear user benefits.
|
| 348 |
-
|
| 349 |
Your Output is a dictionary. Here is the format:
|
| 350 |
-
|
| 351 |
# Your Input:
|
| 352 |
<product_description>
|
| 353 |
# Your Output:
|
|
@@ -363,7 +276,6 @@ Your Output is a dictionary. Here is the format:
|
|
| 363 |
...
|
| 364 |
]
|
| 365 |
}
|
| 366 |
-
|
| 367 |
Don't return anything out of the output format.
|
| 368 |
"""
|
| 369 |
|
|
@@ -595,4 +507,4 @@ demo = gr.Interface(
|
|
| 595 |
"""
|
| 596 |
)
|
| 597 |
|
| 598 |
-
demo.launch(quiet=True)
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain.prompts import PromptTemplate
|
| 2 |
from langchain.chains import LLMChain
|
| 3 |
from pyvis.network import Network
|
|
|
|
| 31 |
|
| 32 |
# Setup
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
REPO_ID_NEAR_FIELD_RAW = "milistu/AMAZON-Products-2023"
|
| 35 |
REPO_ID_NEAR_FIELD = "aslan-ng/amazon_products_2023"
|
| 36 |
REPO_ID_FAR_FIELD = "aslan-ng/amazon_products_2025"
|
| 37 |
+
REPO_ID_LORA_GREEN_PATENTS = "aslan-ng/lora-green-patents"
|
| 38 |
|
| 39 |
def product_quality_score(average_rating: float, rating_number: int):
|
| 40 |
"""
|
| 41 |
Bayesian Average (Amazon-style)
|
|
|
|
| 42 |
Args:
|
| 43 |
average_rating: product's average rating
|
| 44 |
rating_number: number of reviews
|
|
|
|
| 49 |
return C # fallback to global mean
|
| 50 |
return (rating_number / (rating_number + m)) * average_rating + (m / (rating_number + m)) * C
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def load_near_field_raw_from_huggingface():
|
| 53 |
"""
|
| 54 |
Load the raw near-field dataset from HuggingFace.
|
|
|
|
| 110 |
ds = load_dataset(REPO_ID_NEAR_FIELD, split="train")
|
| 111 |
return ds.to_pandas()
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
dataset_near_field = load_near_field_from_huggingface()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
def load_far_field_from_huggingface():
|
| 116 |
"""
|
|
|
|
| 119 |
ds = load_dataset(REPO_ID_FAR_FIELD, split="train")
|
| 120 |
return ds.to_pandas()
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
dataset_far_field = load_far_field_from_huggingface()
|
|
|
|
|
|
|
| 123 |
|
| 124 |
def product_score(product_quality_score: float, fuzzy_score: float):
|
| 125 |
"""
|
|
|
|
| 127 |
"""
|
| 128 |
return math.sqrt(product_quality_score * fuzzy_score)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def query_near_field(input: str, top_k: int=1):
|
| 131 |
"""
|
| 132 |
Return top_k fuzzy matches for query against dataset titles as a pandas DataFrame.
|
|
|
|
| 163 |
.reset_index(drop=True)
|
| 164 |
)
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
def query_far_field(input: str, top_k: int):
|
| 167 |
"""
|
| 168 |
Return top_k random elements from the far_field dataset as a pandas DataFrame.
|
|
|
|
| 191 |
|
| 192 |
return sampled
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
def split_near_and_far_fields(total_examples: int, near_far_ratio: float = 0.5):
|
| 195 |
"""
|
| 196 |
Split the examples between near and far field.
|
|
|
|
| 208 |
|
| 209 |
return near_field_examples, far_field_examples
|
| 210 |
|
|
|
|
|
|
|
|
|
|
| 211 |
def query(input: str, total_examples: int, near_far_ratio: float = 0.5):
|
| 212 |
near_field_examples, far_field_examples = split_near_and_far_fields(total_examples, near_far_ratio)
|
| 213 |
far_field_result = query_far_field(input, far_field_examples)
|
|
|
|
| 260 |
Product functionality refers to what the product does: its features, technical capabilities, and performance characteristics. It answers the question: “What can this product do?”
|
| 261 |
Product value refers to the benefit the customer gains from using the product: how it improves their life, solves their problem, or helps them achieve goals. It answers the question: “Why does this matter to the customer?”
|
| 262 |
Do **not** duplicate an item in both lists. Keep **functionalities** as concrete features. Keep **values** as clear user benefits.
|
|
|
|
| 263 |
Your Output is a dictionary. Here is the format:
|
|
|
|
| 264 |
# Your Input:
|
| 265 |
<product_description>
|
| 266 |
# Your Output:
|
|
|
|
| 276 |
...
|
| 277 |
]
|
| 278 |
}
|
|
|
|
| 279 |
Don't return anything out of the output format.
|
| 280 |
"""
|
| 281 |
|
|
|
|
| 507 |
"""
|
| 508 |
)
|
| 509 |
|
| 510 |
+
demo.launch(quiet=True)
|