Add PDF Last Modified multiplier

This commit is contained in:
Brian Huisman 2023-09-08 15:11:27 -04:00
parent 302c8db00e
commit 511207e0b2
6 changed files with 61 additions and 15 deletions

3
.gitignore vendored
View file

@ -4,4 +4,5 @@ orcinus/GeoIP2/GeoLite2-Country.mmdb
orcinus/GeoIP2/geoip2.phar
orcinus/GeoIP2/COPYRIGHT.txt
orcinus/config.ini.php
*.7z
*.7z
orcinus/js/offline-search.js

View file

@ -769,6 +769,9 @@ if (!$_SESSION['admin_username']) {
// Normalize the "important" weight: when the field was not submitted, fall
// back to the current value in $_ODATA['s_weights']; then clamp to a minimum
// of 0 and format with one decimal place, '.' as the decimal point and no
// thousands separator.
if (!isset($_POST['os_s_weight_important'])) $_POST['os_s_weight_important'] = $_ODATA['s_weights']['important'];
$_POST['os_s_weight_important'] = number_format(max(0, (float)$_POST['os_s_weight_important']), 1, '.', '');
// Normalize the "pdflastmod" multiplier the same way, except that the floor
// is 0.1 rather than 0.
// NOTE(review): the matching form input declares max="10", but no upper
// bound is enforced here -- confirm whether that is intentional.
if (!isset($_POST['os_s_weight_pdflastmod'])) $_POST['os_s_weight_pdflastmod'] = $_ODATA['s_weights']['pdflastmod'];
$_POST['os_s_weight_pdflastmod'] = number_format(max(0.1, (float)$_POST['os_s_weight_pdflastmod']), 1, '.', '');
// Normalize the "css_value" weight (minimum 0, one decimal place).
if (!isset($_POST['os_s_weight_css_value'])) $_POST['os_s_weight_css_value'] = $_ODATA['s_weights']['css_value'];
$_POST['os_s_weight_css_value'] = number_format(max(0, (float)$_POST['os_s_weight_css_value']), 1, '.', '');
@ -780,7 +783,8 @@ if (!$_SESSION['admin_username']) {
'css_value' => $_POST['os_s_weight_css_value'],
'url' => $_POST['os_s_weight_url'],
'multi' => $_POST['os_s_weight_multi'],
'important' => $_POST['os_s_weight_important']
'important' => $_POST['os_s_weight_important'],
'pdflastmod' => $_POST['os_s_weight_pdflastmod']
));
if (isset($_POST['os_s_weight_css'])) {
@ -888,7 +892,7 @@ if (!$_SESSION['admin_username']) {
// ***** Write to and download the Offline Javascript file
$crawldata = $_DDATA['pdo']->query(
'SELECT `url`, `title`, `description`, `keywords`, `category`,
`content_mime`, `weighted`, `content`, `priority`
`content_mime`, `weighted`, `content`, `last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata`
WHERE `flag_unlisted`=0 '.$_RDATA['s_show_orphans'].' AND
`url` LIKE \''.addslashes($_ODATA['jw_hostname']).'/%\';'
@ -2425,7 +2429,7 @@ if (!$_SESSION['admin_username']) {
<h5 class="text-center">
Multipliers
<img src="img/help.svg" alt="Information" class="align-middle svg-icon mb-1"
data-bs-toggle="tooltip" data-bs-placement="top" title="These values MULTIPLY the final relevance score for a search result. Should be greater than 1.0.">
data-bs-toggle="tooltip" data-bs-placement="top" title="These values MULTIPLY the final relevance score for a search result.">
</h5>
<label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">Multi-term:</strong>
@ -2441,6 +2445,18 @@ if (!$_SESSION['admin_username']) {
data-bs-toggle="tooltip" data-bs-placement="bottom" title="Applied for search terms the user has marked as '+important' and &quot;phrase matches&quot;. Default: 1.5">
</span>
</label>
<label class="d-flex lh-lg w-100 mb-2">
<strong class="pe-2">PDF Last Modified:</strong>
<span class="flex-grow-1 text-end text-nowrap">
<input type="number" name="os_s_weight_pdflastmod" value="<?php echo $_ODATA['s_weights']['pdflastmod']; ?>" min="0.1" max="10" step="0.1" class="form-control d-inline-block"
data-bs-toggle="tooltip" data-bs-placement="bottom" title="Rank PDFs by examining their 'Last Modified' dates. Default: 1.0">
</span>
</label>
<p id="os_s_weight_pdflastmod_text" class="form-text">
The <em>PDF Last Modified</em> multiplier lets you rank older PDFs lower in
search results based on years of age. <em>eg</em>. a value of 0.5 means a
year-old PDF has its relevance value halved. Minimum value: 0.1
</p>
</div>
</div>
</li>

View file

@ -196,7 +196,7 @@ if (!count($testConf->fetchAll())) {
`s_results_pagination`=10,
`s_limit_matchtext`=256,
`s_limit_cache`=256,
`s_weights`=\'{"title":"1.3","body":"0.5","keywords":"2.1","description":"0.4","css_value":"1.9","url":"0.2","multi":"2.5","important":"1.5"}\',
`s_weights`=\'{"title":"1.3","body":"0.5","keywords":"2.1","description":"0.4","css_value":"1.9","url":"0.2","multi":"2.5","important":"1.5","pdflastmod":"1.0"}\',
`s_weight_css`=\'.important dt h1 h2 h3\',
`s_show_orphans`=0,
`s_show_filetype_html`=0,

View file

@ -1415,7 +1415,8 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
$getItems = array(
'title' => array('Title', 'dc:title', 'pdf:title'),
'description' => array('Subject', 'dc:description', 'pdf:subject'),
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords')
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords'),
'modified' => array('SourceModified', 'pdfx:sourcemodified', 'CreationDate', 'xmp:createdate')
);
foreach ($getItems as $key => $item) {
@ -1443,6 +1444,10 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
if (!$data['info']['charset']) $data['info']['charset'] = 'CP1252';
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
// If $data['modified'] is non-empty and strtotime() can parse it, use the
// resulting timestamp as this document's file time. When the value is empty
// or strtotime() returns a falsy result, $data['info']['filetime'] is left
// untouched.
// NOTE(review): $data['modified'] is presumably filled from the PDF metadata
// keys listed under 'modified' in $getItems; raw PDF date strings such as
// "D:YYYYMMDDHHmmSS" may not be parseable by strtotime() -- confirm against
// the parser's actual output.
if (!empty($data['modified']))
if ($stamp = strtotime($data['modified']))
$data['info']['filetime'] = $stamp;
if ($data['content']) {
// Discard the PDF text if it contains Unicode control

View file

@ -41,11 +41,12 @@ const os_params = new URLSearchParams(window.location.search);
// ***** Page Object Constructor
function os_page(content_mime, url, category, priority, title, description, keywords, weighted, content) {
function os_page(content_mime, url, category, priority, last_modified, title, description, keywords, weighted, content) {
this.content_mime = content_mime;
this.url = url;
this.category = category;
this.priority = parseFloat(priority);
this.last_modified = parseInt(last_modified);
this.title = title;
this.description = description;
this.keywords = keywords;
@ -62,7 +63,7 @@ function os_page(content_mime, url, category, priority, title, description, keyw
// ***** Search Database
let os_crawldata = [
{{#os_crawldata}}
new os_page('{{{content_mime}}}', '{{{url}}}', '{{{category}}}', {{priority}}, '{{{title}}}', '{{{description}}}', '{{{keywords}}}', '{{{weighted}}}', '{{{words}}}'),
new os_page('{{{content_mime}}}', '{{{url}}}', '{{{category}}}', {{priority}}, {{last_modified}}, '{{{title}}}', '{{{description}}}', '{{{keywords}}}', '{{{weighted}}}', '{{{words}}}'),
{{/os_crawldata}}
];
@ -226,6 +227,7 @@ if (os_crawldata.length) {
// ***** There is never any cache, so do an actual search
let pdfList = [];
for (let y = os_crawldata.length - 1; y >= 0; y--) {
if (filetypes.length) {
let allowMime = false;
@ -237,6 +239,7 @@ if (os_crawldata.length) {
}
}
let addRelevance;
for (let x = 0; x < os_sdata.terms.length; x++) {
addRelevance = 0;
@ -274,20 +277,28 @@ if (os_crawldata.length) {
if (addRelevance) {
os_crawldata[y].multi++;
os_crawldata[y].relevance += addRelevance;
} else if (os_sdata.terms[x][0] == 'phrase')
os_crawldata.splice(y, 1);
}
}
if (addRelevance) {
os_crawldata[y].relevance += addRelevance;
if (os_crawldata[y].content_mime == 'application/pdf')
pdfList.push([y, os_crawldata[y].last_modified]);
// Calculate multipliers
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.multi, os_crawldata[y].multi);
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.important, os_crawldata[y].phrase);
// Calculate multipliers
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.multi, os_crawldata[y].multi);
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.important, os_crawldata[y].phrase);
os_crawldata[y].relevance *= os_crawldata[y].priority;
os_crawldata[y].relevance *= os_crawldata[y].priority;
}
// Apply the PDF Last Modified multiplier
// Skipped unless at least two PDFs were collected in pdfList.
if (pdfList.length > 1) {
for (let y = 0, diff; y < pdfList.length; y++) {
// Age of the PDF in 365-day years; pdfList[y][1] is treated as a Unix
// timestamp in seconds (the current time is converted from ms to s).
diff = ((new Date()).getTime() / 1000 - pdfList[y][1]) / (60 * 60 * 24 * 365);
// Scale relevance by multiplier^age, so a multiplier below 1 lowers older PDFs.
// NOTE(review): pdfList[y][0] is an index into os_crawldata recorded earlier;
// confirm it is still valid if entries were spliced out of os_crawldata
// after the index was stored.
os_crawldata[pdfList[y][0]].relevance *= os_odata.s_weights.pdflastmod ** diff;
}
}

View file

@ -259,7 +259,7 @@ if ($_RDATA['s_searchable_pages']) {
// Begin building the basic query
$searchSQL = '
SELECT `url`, `category`, `content`, `content_mime`, `title`,
`description`, `keywords`, `weighted`, `priority`
`description`, `keywords`, `weighted`, `last_modified`, `priority`
FROM `'.$_DDATA['tbprefix'].'crawldata`
WHERE `flag_unlisted`=0 AND `priority`>0';
@ -349,6 +349,7 @@ if ($_RDATA['s_searchable_pages']) {
$err = $searchQuery->errorInfo();
if ($err[0] == '00000') {
$searchQuery = $searchQuery->fetchAll();
$pdfList = array();
// Apply relevance to each listing and then sort
foreach ($searchQuery as $key => $row) {
@ -356,6 +357,9 @@ if ($_RDATA['s_searchable_pages']) {
$searchQuery[$key]['multi'] = -1;
$searchQuery[$key]['phrase'] = 0;
if ($row['content_mime'] == 'application/pdf')
$pdfList[] = array($key, $row['last_modified']);
// Lowercase values for easy compare
$row['lc_content'] = strtolower($row['content']);
$row['lc_url'] = strtolower($row['url']);
@ -427,6 +431,14 @@ if ($_RDATA['s_searchable_pages']) {
$searchQuery[$key]['relevance'] *= $row['priority'];
}
// Apply the PDF Last Modified multiplier
// Skipped unless at least two PDF rows were collected in $pdfList.
if (count($pdfList) > 1) {
foreach ($pdfList as $value) {
// $value is array(key into $searchQuery, last_modified of that row).
// Age of the PDF in 365-day years; last_modified is treated as a Unix
// timestamp in seconds, since it is subtracted from time().
$diff = (time() - $value[1]) / (60 * 60 * 24 * 365);
// Scale relevance by multiplier^age, so a multiplier below 1 lowers older PDFs.
// NOTE(review): assumes $_ODATA['s_weights']['pdflastmod'] is set and
// numeric -- confirm for installs whose stored s_weights predate this key.
$searchQuery[$value[0]]['relevance'] *= $_ODATA['s_weights']['pdflastmod'] ** $diff;
}
}
// Sort the list by relevance value
usort($searchQuery, function($a, $b) {
if ($b['relevance'] == $a['relevance']) return 0;
@ -541,6 +553,7 @@ if ($_RDATA['s_searchable_pages']) {
unset($_SDATA['results'][$key]['content']);
unset($_SDATA['results'][$key]['keywords']);
unset($_SDATA['results'][$key]['weighted']);
unset($_SDATA['results'][$key]['last_modified']);
unset($_SDATA['results'][$key]['multi']);
unset($_SDATA['results'][$key]['phrase']);
}