@@ -12,7 +12,7 @@ import stract

ELO_K = 32  # Elo K-factor: the maximum rating change from a single comparison
ELO_SCALE = 400  # Elo logistic scale: a 400-point rating gap gives ~10:1 expected odds
-ELO_ROUNDS_MULT = 5
+ELO_ROUNDS_MULT = 7  # pairwise comparisons per query = ELO_ROUNDS_MULT * number of results
NUM_LABELS = 4  # label given to the top-ranked result; lower ranks get smaller labels

PROMPT = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
@@ -21,6 +21,7 @@ You are a helpful, smart, kind, and efficient AI assistant. You always fulfill t

Think about this step-by-step. You are a search engine evaluator and your task is to evaluate search results based on how well the result matches the query.
You will be shown two results for each query and must choose which result is best for the user's query. A good result must answer the user's query and come from an authoritative source.
+A good result should be in the same language as the query or in English.
To choose the best result, write "Best: RESULT_A" or "Best: RESULT_B". Before choosing the best result, you should first evaluate the relevance of each result to the query.

Query: "{}"
@@ -59,25 +60,12 @@ np.random.shuffle(all_queries)
db = Db("data/auto-ranking-annotation.sqlite")

for query in all_queries:
-    if len(query) < 3:
-        continue
-
-    # check if query has large percentage of non-alphanumeric characters
-    if sum([c.isalnum() for c in query]) / len(query) < 0.5:
-        continue
-
-    # only consider queries with at least two words
-    if len(query.split()) < 2:
-        continue
-
-    if len(query) > 100:
-        continue
-
    db.add_query(query)


unannotated_queries = db.get_unannotated_queries()

+
def add_results(qid, query):
    results = stract.search(query)
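+    # pause between searches (presumably simple rate limiting of the Stract API)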
    time.sleep(1)
@@ -103,6 +91,7 @@ def get_best(res):
        return 0
    return None
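+    # fallback: the completion contained neither "Best: RESULT_A" nor "Best: RESULT_B"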

+
def elo_update(winner, loser, elo):
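+    # Standard Elo expected score: p_winner = 1 / (1 + 10^((R_loser - R_winner) / ELO_SCALE)).
+    # Equal ratings give 0.5; a rating lead of ELO_SCALE (400 points) gives ~0.91.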
    p_winner = 1 / (1 + 10 ** ((elo[loser] - elo[winner]) / ELO_SCALE))
    p_loser = 1 - p_winner
@@ -120,11 +109,19 @@ for qid, query in tqdm(unannotated_queries.items()):
    elo = {url: ELO_SCALE // 2 for url, _, _ in unnanotated_results}  # every result starts at the same rating

    for _ in tqdm(range(0, ELO_ROUNDS_MULT * len(unnanotated_results))):
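+        # Each round samples a random pair of results and asks the LLM which one
+        # better answers the query; the Elo ratings are then updated from the verdict.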
|
|
|
- (url_a, _, json_a), (url_b, _, json_b)= random.sample(unnanotated_results, 2)
|
|
|
+ (url_a, _, json_a), (url_b, _, json_b) = random.sample(unnanotated_results, 2)
|
|
|
|
|
|
webpage_a = json.loads(json_a)
|
|
|
webpage_b = json.loads(json_b)
|
|
|
- prompt = get_prompt(query, url_a, webpage_a["title"], webpage_a["snippet"], url_b, webpage_b["title"], webpage_b["snippet"])
|
|
|
+ prompt = get_prompt(
|
|
|
+ query,
|
|
|
+ url_a,
|
|
|
+ webpage_a["title"],
|
|
|
+ webpage_a["snippet"],
|
|
|
+ url_b,
|
|
|
+ webpage_b["title"],
|
|
|
+ webpage_b["snippet"],
|
|
|
+ )
|
|
|
output = llm.create_completion(
|
|
|
prompt,
|
|
|
max_tokens=1024,
@@ -147,11 +144,15 @@ for qid, query in tqdm(unannotated_queries.items()):
    elo = [{"url": url} for url, _ in elo]

    for i in range(len(elo)):
-        elo[i]['label'] = NUM_LABELS - int(np.log2(i + 1))
+        elo[i]["label"] = NUM_LABELS - int(np.log2(i + 1))
+
+    tqdm.write(query)
+
+    for website in elo:
+        tqdm.write(f"{website['url']} - {website['label']}")
+    tqdm.write("")

-    print(query)
-    pprint(elo)
    for website in elo:
-        url = website['url']
-        relevancy = website['label']
+        url = website["url"]
+        relevancy = website["label"]
        db.annotate(qid, url, relevancy)