
ltr experiment

Mikkel Denker 1 year ago
parent
commit
9c2d27fe05
6 changed files with 52 additions and 38 deletions
  1. crates/leechy-py/src/lib.rs (+1 -1)
  2. crates/leechy/src/conf.toml (+5 -0)
  3. crates/optics/testcases/samples (+1 -1)
  4. ltr/auto_annotate.py (+23 -22)
  5. ltr/lambdamart.py (+5 -4)
  6. ltr/leechy_annotate.py (+17 -10)

+ 1 - 1
crates/leechy-py/src/lib.rs

@@ -37,7 +37,7 @@ mod leechy {
     #[pymethods]
     impl Engine {
         #[new]
-        #[pyo3(signature = (name = "startpage"))]
+        #[pyo3(signature = (name = "google"))]
         fn new(name: &str) -> PyResult<Self> {
             match lchy::Engine::by_name(name) {
                 Some(engine) => Ok(Self { inner: engine }),
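
On the Python side this only changes the constructor default. A minimal usage sketch, assuming the pyo3 module is importable as "leechy" (the import name is an assumption; the class and signature come from the bindings above):

    from leechy import Engine

    engine = Engine()              # after this commit, equivalent to Engine(name="google")
    legacy = Engine("startpage")   # the previous default can still be selected by name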

+ 5 - 0
crates/leechy/src/conf.toml

@@ -2,3 +2,8 @@
 name = "startpage"
 search = "https://www.startpage.com/sp/search?q={query}&qsr=en_US"
 xpath = "./div[contains(@class, 'result')]/a"
+
+[[engines]]
+name = "google"
+search = "https://www.google.com/search?q={query}&udm=14&hl=en-US"
+xpath = "./div[contains(@jscontroller, 'SC7lYd')]/./a[contains(@jsname, 'UWckNb')]"
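
The new entry follows the same shape as the startpage engine above it: a URL template with a {query} placeholder plus an XPath that selects result links. A minimal sketch of how such an entry resolves to links, in Python with requests and lxml (both assumptions; the real consumer is the Rust engine code):

    import requests
    from lxml import html
    from urllib.parse import quote_plus

    # URL template copied from the [[engines]] entry above.
    SEARCH = "https://www.google.com/search?q={query}&udm=14&hl=en-US"

    def search(query: str) -> list[str]:
        url = SEARCH.format(query=quote_plus(query))
        tree = html.fromstring(requests.get(url, timeout=10).text)
        # The TOML stores the XPath relative to some container node; anchoring
        # it at the document root here is an assumption for illustration.
        return [a.get("href") for a in tree.xpath(
            "//div[contains(@jscontroller, 'SC7lYd')]"
            "/a[contains(@jsname, 'UWckNb')]"
        )]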

+ 1 - 1
crates/optics/testcases/samples

@@ -1 +1 @@
-Subproject commit 4e566ed2d34eba694e0453bc8aa1d6ce42d0bfa9
+Subproject commit ee88073dcf0db0c1f5f9f1eaa432be9e50a97bce

+ 23 - 22
ltr/auto_annotate.py

@@ -12,7 +12,7 @@ import stract
 
 ELO_K = 32
 ELO_SCALE = 400
-ELO_ROUNDS_MULT = 5
+ELO_ROUNDS_MULT = 7
 NUM_LABELS = 4
 
 PROMPT = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
@@ -21,6 +21,7 @@ You are a helpful, smart, kind, and efficient AI assistant. You always fulfill t
 
 Think about this step-by-step. You are a search engine evaluator and your task is to evaluate search results based on how well the result matches the query.
 You will be shown two results for each query and must choose which result is best for the user's query. A good result must answer the user's query and come from an authoritative source.
+A good result should be in the same language as the query or in English.
 To choose the best result, write "Best: RESULT_A" or "Best: RESULT_B". Before choosing the best result, you should first evaluate the relevance of each result to the query.
 
 Query: "{}"
@@ -59,25 +60,12 @@ np.random.shuffle(all_queries)
 db = Db("data/auto-ranking-annotation.sqlite")
 
 for query in all_queries:
-    if len(query) < 3:
-        continue
-
-    # check if query has large percentage of non-alphanumeric characters
-    if sum([c.isalnum() for c in query]) / len(query) < 0.5:
-        continue
-
-    # only consider queries with at least two words
-    if len(query.split()) < 2:
-        continue
-
-    if len(query) > 100:
-        continue
-
     db.add_query(query)
 
 
 unannotated_queries = db.get_unannotated_queries()
 
+
 def add_results(qid, query):
     results = stract.search(query)
     time.sleep(1)
@@ -103,6 +91,7 @@ def get_best(res):
         return 0
     return None
 
+
 def elo_update(winner, loser, elo):
     p_winner = 1 / (1 + 10 ** ((elo[loser] - elo[winner]) / ELO_SCALE))
     p_loser = 1 - p_winner
@@ -120,11 +109,19 @@ for qid, query in tqdm(unannotated_queries.items()):
     elo = {url: ELO_SCALE // 2 for url, _, _ in unnanotated_results}
 
     for _ in tqdm(range(0, ELO_ROUNDS_MULT * len(unnanotated_results))):
-        (url_a, _, json_a), (url_b, _, json_b)= random.sample(unnanotated_results, 2)
+        (url_a, _, json_a), (url_b, _, json_b) = random.sample(unnanotated_results, 2)
 
         webpage_a = json.loads(json_a)
         webpage_b = json.loads(json_b)
-        prompt = get_prompt(query, url_a, webpage_a["title"], webpage_a["snippet"], url_b, webpage_b["title"], webpage_b["snippet"])
+        prompt = get_prompt(
+            query,
+            url_a,
+            webpage_a["title"],
+            webpage_a["snippet"],
+            url_b,
+            webpage_b["title"],
+            webpage_b["snippet"],
+        )
         output = llm.create_completion(
             prompt,
             max_tokens=1024,
@@ -147,11 +144,15 @@ for qid, query in tqdm(unannotated_queries.items()):
     elo = [{"url": url} for url, _ in elo]
 
     for i in range(len(elo)):
-        elo[i]['label'] = NUM_LABELS - int(np.log2(i + 1))
+        elo[i]["label"] = NUM_LABELS - int(np.log2(i + 1))
+
+    tqdm.write(query)
+
+    for website in elo:
+        tqdm.write(f"{website['url']} - {website['label']}")
+    tqdm.write("")
 
-    print(query)
-    pprint(elo)
     for website in elo:
-        url = website['url']
-        relevancy = website['label']
+        url = website["url"]
+        relevancy = website["label"]
         db.annotate(qid, url, relevancy)
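
For context on the arithmetic this loop relies on: each of the ELO_ROUNDS_MULT * len(results) rounds asks the LLM to pick a winner between two random results, elo_update shifts the two ratings, and the final Elo-sorted list is bucketed into labels with NUM_LABELS - int(np.log2(i + 1)). A self-contained sketch of one update using the constants from this file (the two += lines are the standard Elo step; they sit outside the visible hunks, so their exact form is an assumption):

    ELO_K = 32        # from auto_annotate.py
    ELO_SCALE = 400   # from auto_annotate.py

    elo = {"a.example": 200, "b.example": 200}  # both start at ELO_SCALE // 2

    def elo_update(winner, loser, elo):
        # Expected win probability given the current rating gap (shown in the hunk).
        p_winner = 1 / (1 + 10 ** ((elo[loser] - elo[winner]) / ELO_SCALE))
        p_loser = 1 - p_winner
        # Standard Elo adjustment (assumed; not visible in the diff).
        elo[winner] += ELO_K * (1 - p_winner)
        elo[loser] += ELO_K * (0 - p_loser)

    elo_update("a.example", "b.example", elo)
    print(elo)  # {'a.example': 216.0, 'b.example': 184.0}

With NUM_LABELS = 4, the label formula gives the top-ranked result label 4, ranks 2-3 label 3, ranks 4-7 label 2, and so on down the sorted list.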

+ 5 - 4
ltr/lambdamart.py

@@ -13,13 +13,13 @@ param_grid = {
     "verbosity": [-1],
     "metric": ["ndcg"],
     "ndcg_at": [[1, 2, 3, 5, 10]],
-    "learning_rate": [0.003],
+    # "learning_rate": [0.003],
     # "num_iterations": [100],
     # "max_depth": [-1, 2, 4, 8],
     "max_depth": [-1],
     # "num_leaves": [7, 15, 31],
-    "num_leaves": [31],
-    "lambda_l2": [1.0, 1.5, 2.0],
+    "num_leaves": [63],
+    "lambda_l2": [2.0, 2.5],
     "linear_tree": [False],
 }
 
@@ -27,7 +27,8 @@ param_grid = {
 accepted_queries = set()
 with open("data/queries_us.csv") as f:
     for query in f.readlines():
-        accepted_queries.add(query.strip().lower())
+        if len(query.strip()) > 1:
+            accepted_queries.add(query.strip().lower())
 
 con = sqlite3.connect("data/auto-ranking-annotation.sqlite")
 cur = con.cursor()
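
The dict above is a search space, not a single model: each value is a list of candidates, and each grid point trains a LightGBM LambdaMART ranker scored by NDCG at cutoffs 1, 2, 3, 5, and 10. A hedged sketch of training one grid point on synthetic grouped data (the lambdarank objective and the data are assumptions; only the tuned values come from the file):

    import lightgbm as lgb
    import numpy as np

    params = {
        "objective": "lambdarank",  # assumed; the objective key is not in the hunk
        "metric": "ndcg",
        "ndcg_at": [1, 2, 3, 5, 10],
        "verbosity": -1,
        "max_depth": -1,
        "num_leaves": 63,
        "lambda_l2": 2.0,
        "linear_tree": False,
    }

    rng = np.random.default_rng(0)
    X = rng.random((100, 8))       # 100 results, 8 ranking features
    y = rng.integers(0, 5, 100)    # relevance labels 0..NUM_LABELS
    group = [10] * 10              # ten queries, ten results each

    booster = lgb.train(params, lgb.Dataset(X, label=y, group=group))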

+ 17 - 10
ltr/leechy_annotate.py

@@ -9,23 +9,30 @@ import sys
 NUM_LABELS = 4
 
 with open("data/queries_us.csv") as f:
-    all_queries = [line.strip() for line in f.readlines()]
+    all_queries = []
+    for query in [line.strip() for line in f.readlines()]:
+        if len(query) < 3:
+            continue
+
+        # check if query has large percentage of non-alphanumeric characters
+        if sum([c.isalnum() for c in query]) / len(query) < 0.5:
+            continue
+
+        if len(query) > 100:
+            continue
+
+        if len(query.split()) <= 1:
+            continue
+
+        all_queries.append(query)
 
 # shuffle queries
+
 np.random.shuffle(all_queries)
 
 db = Db("data/auto-ranking-annotation.sqlite")
 
 for query in all_queries:
-    if len(query) < 3:
-        continue
-
-    # check if query has large percentage of non-alphanumeric characters
-    if sum([c.isalnum() for c in query]) / len(query) < 0.5:
-        continue
-
-    if len(query) > 100:
-        continue
 
     db.add_query(query)
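
The relocated filter keeps only queries that are at least 3 characters, at most 100, at least half alphanumeric, and more than one word. A small illustration of what it accepts (the sample inputs are made up):

    def keep(query: str) -> bool:
        # Same conditions as the loop above, inverted into a single predicate.
        return (
            len(query) >= 3
            and sum(c.isalnum() for c in query) / len(query) >= 0.5
            and len(query) <= 100
            and len(query.split()) > 1
        )

    samples = ["ab", "?? !! ::", "x" * 120, "weather", "rust async runtime"]
    print([q for q in samples if keep(q)])  # ['rust async runtime']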