Add PDF Last Modified multiplier
This commit is contained in:
parent
302c8db00e
commit
511207e0b2
6 changed files with 61 additions and 15 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -4,4 +4,5 @@ orcinus/GeoIP2/GeoLite2-Country.mmdb
|
|||
orcinus/GeoIP2/geoip2.phar
|
||||
orcinus/GeoIP2/COPYRIGHT.txt
|
||||
orcinus/config.ini.php
|
||||
*.7z
|
||||
*.7z
|
||||
orcinus/js/offline-search.js
|
||||
|
|
|
@ -769,6 +769,9 @@ if (!$_SESSION['admin_username']) {
|
|||
if (!isset($_POST['os_s_weight_important'])) $_POST['os_s_weight_important'] = $_ODATA['s_weights']['important'];
|
||||
$_POST['os_s_weight_important'] = number_format(max(0, (float)$_POST['os_s_weight_important']), 1, '.', '');
|
||||
|
||||
if (!isset($_POST['os_s_weight_pdflastmod'])) $_POST['os_s_weight_pdflastmod'] = $_ODATA['s_weights']['pdflastmod'];
|
||||
$_POST['os_s_weight_pdflastmod'] = number_format(max(0.1, (float)$_POST['os_s_weight_pdflastmod']), 1, '.', '');
|
||||
|
||||
if (!isset($_POST['os_s_weight_css_value'])) $_POST['os_s_weight_css_value'] = $_ODATA['s_weights']['css_value'];
|
||||
$_POST['os_s_weight_css_value'] = number_format(max(0, (float)$_POST['os_s_weight_css_value']), 1, '.', '');
|
||||
|
||||
|
@ -780,7 +783,8 @@ if (!$_SESSION['admin_username']) {
|
|||
'css_value' => $_POST['os_s_weight_css_value'],
|
||||
'url' => $_POST['os_s_weight_url'],
|
||||
'multi' => $_POST['os_s_weight_multi'],
|
||||
'important' => $_POST['os_s_weight_important']
|
||||
'important' => $_POST['os_s_weight_important'],
|
||||
'pdflastmod' => $_POST['os_s_weight_pdflastmod']
|
||||
));
|
||||
|
||||
if (isset($_POST['os_s_weight_css'])) {
|
||||
|
@ -888,7 +892,7 @@ if (!$_SESSION['admin_username']) {
|
|||
// ***** Write to and download the Offline Javascript file
|
||||
$crawldata = $_DDATA['pdo']->query(
|
||||
'SELECT `url`, `title`, `description`, `keywords`, `category`,
|
||||
`content_mime`, `weighted`, `content`, `priority`
|
||||
`content_mime`, `weighted`, `content`, `last_modified`, `priority`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
WHERE `flag_unlisted`=0 '.$_RDATA['s_show_orphans'].' AND
|
||||
`url` LIKE \''.addslashes($_ODATA['jw_hostname']).'/%\';'
|
||||
|
@ -2425,7 +2429,7 @@ if (!$_SESSION['admin_username']) {
|
|||
<h5 class="text-center">
|
||||
Multipliers
|
||||
<img src="img/help.svg" alt="Information" class="align-middle svg-icon mb-1"
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="These values MULTIPLY the final relevance score for a search result. Should be greater than 1.0.">
|
||||
data-bs-toggle="tooltip" data-bs-placement="top" title="These values MULTIPLY the final relevance score for a search result.">
|
||||
</h5>
|
||||
<label class="d-flex lh-lg w-100 mb-2">
|
||||
<strong class="pe-2">Multi-term:</strong>
|
||||
|
@ -2441,6 +2445,18 @@ if (!$_SESSION['admin_username']) {
|
|||
data-bs-toggle="tooltip" data-bs-placement="bottom" title="Applied for search terms the user has marked as '+important' and &quot;phrase matches&quot;. Default: 1.5">
|
||||
</span>
|
||||
</label>
|
||||
<label class="d-flex lh-lg w-100 mb-2">
|
||||
<strong class="pe-2">PDF Last Modified:</strong>
|
||||
<span class="flex-grow-1 text-end text-nowrap">
|
||||
<input type="number" name="os_s_weight_pdflastmod" value="<?php echo $_ODATA['s_weights']['pdflastmod']; ?>" min="0.1" max="10" step="0.1" class="form-control d-inline-block"
|
||||
data-bs-toggle="tooltip" data-bs-placement="bottom" title="Rank PDFs by examining their 'Last Modified' dates. Default: 1.0">
|
||||
</span>
|
||||
</label>
|
||||
<p id="os_s_weight_pdflastmod_text" class="form-text">
|
||||
The <em>PDF Last Modified</em> multiplier lets you rank older PDFs lower in
|
||||
search results based on years of age. For example, a value of 0.5 means a
|
||||
year-old PDF has its relevance value halved. Minimum value: 0.1
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
|
|
|
@ -196,7 +196,7 @@ if (!count($testConf->fetchAll())) {
|
|||
`s_results_pagination`=10,
|
||||
`s_limit_matchtext`=256,
|
||||
`s_limit_cache`=256,
|
||||
`s_weights`=\'{"title":"1.3","body":"0.5","keywords":"2.1","description":"0.4","css_value":"1.9","url":"0.2","multi":"2.5","important":"1.5"}\',
|
||||
`s_weights`=\'{"title":"1.3","body":"0.5","keywords":"2.1","description":"0.4","css_value":"1.9","url":"0.2","multi":"2.5","important":"1.5","pdflastmod":"1.0"}\',
|
||||
`s_weight_css`=\'.important dt h1 h2 h3\',
|
||||
`s_show_orphans`=0,
|
||||
`s_show_filetype_html`=0,
|
||||
|
|
|
@ -1415,7 +1415,8 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
$getItems = array(
|
||||
'title' => array('Title', 'dc:title', 'pdf:title'),
|
||||
'description' => array('Subject', 'dc:description', 'pdf:subject'),
|
||||
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords')
|
||||
'keywords' => array('Keywords', 'dc:subject', 'pdf:keywords'),
|
||||
'modified' => array('SourceModified', 'pdfx:sourcemodified', 'CreationDate', 'xmp:createdate')
|
||||
);
|
||||
|
||||
foreach ($getItems as $key => $item) {
|
||||
|
@ -1443,6 +1444,10 @@ while ($_cURL && count($_RDATA['sp_queue'])) {
|
|||
if (!$data['info']['charset']) $data['info']['charset'] = 'CP1252';
|
||||
OS_cleanTextUTF8($data['content'], $data['info']['charset']);
|
||||
|
||||
if (!empty($data['modified']))
|
||||
if ($stamp = strtotime($data['modified']))
|
||||
$data['info']['filetime'] = $stamp;
|
||||
|
||||
if ($data['content']) {
|
||||
|
||||
// Discard the PDF text if it contains Unicode control
|
||||
|
|
|
@ -41,11 +41,12 @@ const os_params = new URLSearchParams(window.location.search);
|
|||
|
||||
|
||||
// ***** Page Object Constructor
|
||||
function os_page(content_mime, url, category, priority, title, description, keywords, weighted, content) {
|
||||
function os_page(content_mime, url, category, priority, last_modified, title, description, keywords, weighted, content) {
|
||||
this.content_mime = content_mime;
|
||||
this.url = url;
|
||||
this.category = category;
|
||||
this.priority = parseFloat(priority);
|
||||
this.last_modified = parseInt(last_modified);
|
||||
this.title = title;
|
||||
this.description = description;
|
||||
this.keywords = keywords;
|
||||
|
@ -62,7 +63,7 @@ function os_page(content_mime, url, category, priority, title, description, keyw
|
|||
// ***** Search Database
|
||||
let os_crawldata = [
|
||||
{{#os_crawldata}}
|
||||
new os_page('{{{content_mime}}}', '{{{url}}}', '{{{category}}}', {{priority}}, '{{{title}}}', '{{{description}}}', '{{{keywords}}}', '{{{weighted}}}', '{{{words}}}'),
|
||||
new os_page('{{{content_mime}}}', '{{{url}}}', '{{{category}}}', {{priority}}, {{last_modified}}, '{{{title}}}', '{{{description}}}', '{{{keywords}}}', '{{{weighted}}}', '{{{words}}}'),
|
||||
{{/os_crawldata}}
|
||||
];
|
||||
|
||||
|
@ -226,6 +227,7 @@ if (os_crawldata.length) {
|
|||
|
||||
|
||||
// ***** There is never any cache, so do an actual search
|
||||
let pdfList = [];
|
||||
for (let y = os_crawldata.length - 1; y >= 0; y--) {
|
||||
if (filetypes.length) {
|
||||
let allowMime = false;
|
||||
|
@ -237,6 +239,7 @@ if (os_crawldata.length) {
|
|||
}
|
||||
}
|
||||
|
||||
let addRelevance;
|
||||
for (let x = 0; x < os_sdata.terms.length; x++) {
|
||||
addRelevance = 0;
|
||||
|
||||
|
@ -274,20 +277,28 @@ if (os_crawldata.length) {
|
|||
|
||||
if (addRelevance) {
|
||||
os_crawldata[y].multi++;
|
||||
os_crawldata[y].relevance += addRelevance;
|
||||
} else if (os_sdata.terms[x][0] == 'phrase')
|
||||
os_crawldata.splice(y, 1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (addRelevance) {
|
||||
os_crawldata[y].relevance += addRelevance;
|
||||
if (os_crawldata[y].content_mime == 'application/pdf')
|
||||
pdfList.push([y, os_crawldata[y].last_modified]);
|
||||
|
||||
// Calculate multipliers
|
||||
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.multi, os_crawldata[y].multi);
|
||||
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.important, os_crawldata[y].phrase);
|
||||
// Calculate multipliers
|
||||
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.multi, os_crawldata[y].multi);
|
||||
os_crawldata[y].relevance *= Math.pow(os_odata.s_weights.important, os_crawldata[y].phrase);
|
||||
|
||||
os_crawldata[y].relevance *= os_crawldata[y].priority;
|
||||
os_crawldata[y].relevance *= os_crawldata[y].priority;
|
||||
}
|
||||
|
||||
// Apply the PDF Last Modified multiplier
|
||||
if (pdfList.length > 1) {
|
||||
for (let y = 0, diff; y < pdfList.length; y++) {
|
||||
diff = ((new Date()).getTime() / 1000 - pdfList[y][1]) / (60 * 60 * 24 * 365);
|
||||
os_crawldata[pdfList[y][0]].relevance *= os_odata.s_weights.pdflastmod ** diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -259,7 +259,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
// Begin building the basic query
|
||||
$searchSQL = '
|
||||
SELECT `url`, `category`, `content`, `content_mime`, `title`,
|
||||
`description`, `keywords`, `weighted`, `priority`
|
||||
`description`, `keywords`, `weighted`, `last_modified`, `priority`
|
||||
FROM `'.$_DDATA['tbprefix'].'crawldata`
|
||||
WHERE `flag_unlisted`=0 AND `priority`>0';
|
||||
|
||||
|
@ -349,6 +349,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$err = $searchQuery->errorInfo();
|
||||
if ($err[0] == '00000') {
|
||||
$searchQuery = $searchQuery->fetchAll();
|
||||
$pdfList = array();
|
||||
|
||||
// Apply relevance to each listing and then sort
|
||||
foreach ($searchQuery as $key => $row) {
|
||||
|
@ -356,6 +357,9 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$searchQuery[$key]['multi'] = -1;
|
||||
$searchQuery[$key]['phrase'] = 0;
|
||||
|
||||
if ($row['content_mime'] == 'application/pdf')
|
||||
$pdfList[] = array($key, $row['last_modified']);
|
||||
|
||||
// Lowercase values for easy compare
|
||||
$row['lc_content'] = strtolower($row['content']);
|
||||
$row['lc_url'] = strtolower($row['url']);
|
||||
|
@ -427,6 +431,14 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
$searchQuery[$key]['relevance'] *= $row['priority'];
|
||||
}
|
||||
|
||||
// Apply the PDF Last Modified multiplier
|
||||
if (count($pdfList) > 1) {
|
||||
foreach ($pdfList as $value) {
|
||||
$diff = (time() - $value[1]) / (60 * 60 * 24 * 365);
|
||||
$searchQuery[$value[0]]['relevance'] *= $_ODATA['s_weights']['pdflastmod'] ** $diff;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the list by relevance value
|
||||
usort($searchQuery, function($a, $b) {
|
||||
if ($b['relevance'] == $a['relevance']) return 0;
|
||||
|
@ -541,6 +553,7 @@ if ($_RDATA['s_searchable_pages']) {
|
|||
unset($_SDATA['results'][$key]['content']);
|
||||
unset($_SDATA['results'][$key]['keywords']);
|
||||
unset($_SDATA['results'][$key]['weighted']);
|
||||
unset($_SDATA['results'][$key]['last_modified']);
|
||||
unset($_SDATA['results'][$key]['multi']);
|
||||
unset($_SDATA['results'][$key]['phrase']);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue