fix #2019 last_modified metadata support

This commit is contained in:
Shinsuke Sugaya 2019-02-14 12:29:26 +09:00
parent 8eadcbb48b
commit d38ce5982a
6 changed files with 95 additions and 16 deletions

View file

@ -158,6 +158,8 @@ public class Constants extends CoreLibConstants {
public static final String ISO_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
public static final String DATE_OPTIONAL_TIME = "date_optional_time";
public static final int DONE_STATUS = 9999;
public static final String DEFAULT_IGNORE_FAILURE_TYPE = StringUtil.EMPTY;

View file

@ -19,7 +19,6 @@ import static org.codelibs.core.stream.StreamUtil.stream;
import java.io.InputStream;
import java.net.URLDecoder;
import java.time.temporal.TemporalAccessor;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
@ -55,8 +54,8 @@ import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.PermissionHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.taglib.FessFunctions;
import org.codelibs.fess.util.ComponentUtil;
import org.elasticsearch.common.joda.Joda;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -140,10 +139,15 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
} else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getValue2())) {
dataMap.put(mapping.getValue1(), Double.parseDouble(values[0]));
} else if (Constants.MAPPING_TYPE_DATE.equalsIgnoreCase(mapping.getValue2())) {
final String format =
StringUtil.isNotBlank(mapping.getValue3()) ? mapping.getValue3() : "date_optional_time";
final TemporalAccessor dt = Joda.forPattern(format).parse(mapping.getValue2());
dataMap.put(mapping.getValue1(), Joda.forPattern("date_optional_time").format(dt));
final Date dt =
FessFunctions.parseDate(values[0],
StringUtil.isNotBlank(mapping.getValue3()) ? mapping.getValue3()
: Constants.DATE_OPTIONAL_TIME);
if (dt != null) {
dataMap.put(mapping.getValue1(), FessFunctions.formatDate(dt));
} else {
logger.warn("Failed to parse " + mapping.toString());
}
} else {
logger.warn("Unknown mapping type: {}={}", key, mapping);
}
@ -271,9 +275,9 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
// content_length
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified
final Date lastModified = responseData.getLastModified();
final Date lastModified = getLastModified(dataMap, responseData);
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
dataMap.put(fessConfig.getIndexFieldLastModified(), lastModified); // overwrite
// timestamp
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
@ -331,6 +335,28 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
return dataMap;
}
/**
 * Determines the last-modified date to use for a document.
 *
 * The value already stored in {@code dataMap} under the configured
 * last-modified field name is preferred: a {@link Date} is returned as is,
 * while a {@code String} (or the first element of a non-empty
 * {@code String[]}) is passed to the one-argument
 * {@code FessFunctions.parseDate}. When nothing usable is stored, or the
 * text cannot be parsed, the date reported by the response is returned.
 *
 * @param dataMap the document fields collected so far
 * @param responseData the crawler response supplying the fallback date
 * @return the stored date when available, otherwise {@code responseData.getLastModified()}
 */
protected Date getLastModified(final Map<String, Object> dataMap, final ResponseData responseData) {
    final Object stored = dataMap.get(fessConfig.getIndexFieldLastModified());
    if (stored instanceof Date) {
        return (Date) stored;
    }

    // Stays null unless a textual value is present and parses successfully.
    Date parsed = null;
    if (stored instanceof String) {
        parsed = FessFunctions.parseDate(stored.toString());
    } else if (stored instanceof String[]) {
        final String[] candidates = (String[]) stored;
        if (candidates.length != 0) {
            // Only the first entry of a multi-valued field is considered.
            parsed = FessFunctions.parseDate(candidates[0]);
        }
    }

    return parsed != null ? parsed : responseData.getLastModified();
}
protected boolean hasTitle(final Map<String, Object> dataMap) {
final Object titleObj = dataMap.get(fessConfig.getIndexFieldTitle());
if (titleObj != null) {
@ -366,7 +392,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
return new ExtractData();
}
private String getResourceName(final ResponseData responseData) {
protected String getResourceName(final ResponseData responseData) {
String name = responseData.getUrl();
final String enc = responseData.getCharSet();

View file

@ -288,6 +288,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. title=title:string
Title=title:string
last_modified=Last-Save-Date:date
last_modified=Last-Modified:date
*/
String CRAWLER_METADATA_NAME_MAPPING = "crawler.metadata.name.mapping";
@ -2208,6 +2210,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
* Get the value for the key 'crawler.metadata.name.mapping'. <br>
* The value is, e.g. title=title:string
Title=title:string
last_modified=Last-Save-Date:date
last_modified=Last-Modified:date
<br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
@ -8615,7 +8619,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES, "404");
defaultMap.put(FessConfig.CRAWLER_SYSTEM_MONITOR_INTERVAL, "60");
defaultMap.put(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES, "resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*");
defaultMap.put(FessConfig.CRAWLER_METADATA_NAME_MAPPING, "title=title:string\nTitle=title:string\n");
defaultMap.put(FessConfig.CRAWLER_METADATA_NAME_MAPPING,
"title=title:string\nTitle=title:string\nlast_modified=Last-Save-Date:date\nlast_modified=Last-Modified:date\n");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH, "//META[@name='description']/@content");

View file

@ -23,7 +23,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
@ -46,6 +45,7 @@ import org.codelibs.fess.Constants;
import org.codelibs.fess.entity.FacetQueryView;
import org.codelibs.fess.helper.ViewHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.elasticsearch.common.joda.Joda;
import org.lastaflute.di.util.LdiURLUtil;
import org.lastaflute.web.util.LaRequestUtil;
import org.lastaflute.web.util.LaResponseUtil;
@ -113,7 +113,7 @@ public class FessFunctions {
}
public static Date parseDate(final String value) {
return parseDate(value, Constants.ISO_DATETIME_FORMAT);
return parseDate(value, Constants.DATE_OPTIONAL_TIME);
}
public static Date parseDate(final String value, final String format) {
@ -121,10 +121,9 @@ public class FessFunctions {
return null;
}
try {
final SimpleDateFormat sdf = new SimpleDateFormat(format);
sdf.setTimeZone(Constants.TIMEZONE_UTC);
return sdf.parse(value);
} catch (final ParseException e) {
final long time = Joda.forPattern(format).parseMillis(value);
return new Date(time);
} catch (final Exception e) {
return null;
}
}

View file

@ -177,6 +177,8 @@ crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Co
crawler.metadata.name.mapping=\
title=title:string\n\
Title=title:string\n\
last_modified=Last-Save-Date:date\n\
last_modified=Last-Modified:date\n\
# html
crawler.document.html.content.xpath=//BODY

View file

@ -0,0 +1,45 @@
/*
* Copyright 2012-2019 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.taglib;
import java.util.Date;
import org.codelibs.fess.unit.UnitFessTestCase;
/**
 * Unit test for {@code FessFunctions.parseDate(String)}, checking each parsed
 * value through {@code FessFunctions.formatDate(Date)}.
 */
public class FessFunctionsTest extends UnitFessTestCase {

    /**
     * Verifies that an empty string yields {@code null} and that several
     * date/time spellings parse to the expected formatted value.
     */
    public void test_parseDate() {
        // empty input
        final Date empty = FessFunctions.parseDate("");
        assertNull(empty);

        // full timestamp with milliseconds
        final Date withMillis = FessFunctions.parseDate("2004-04-01T12:34:56.123Z");
        assertEquals("2004-04-01T12:34:56.123Z", FessFunctions.formatDate(withMillis));

        // seconds precision
        final Date withSeconds = FessFunctions.parseDate("2004-04-01T12:34:56Z");
        assertEquals("2004-04-01T12:34:56.000Z", FessFunctions.formatDate(withSeconds));

        // minutes precision
        final Date withMinutes = FessFunctions.parseDate("2004-04-01T12:34Z");
        assertEquals("2004-04-01T12:34:00.000Z", FessFunctions.formatDate(withMinutes));

        // date only
        final Date dateOnly = FessFunctions.parseDate("2004-04-01");
        assertEquals("2004-04-01T00:00:00.000Z", FessFunctions.formatDate(dateOnly));

        // explicit +09:00 offset
        final Date withOffset = FessFunctions.parseDate("2004-04-01T12:34:56.123+09:00");
        assertEquals("2004-04-01T03:34:56.123Z", FessFunctions.formatDate(withOffset));
    }
}