This commit is contained in:
Shinsuke Sugaya 2016-02-09 22:54:01 +09:00
parent 684507731e
commit e7c6906bac
5 changed files with 68 additions and 96 deletions

View file

@ -59,7 +59,7 @@
<utflute.version>0.6.0F</utflute.version>
<!-- Crawler -->
<crawler.version>1.0.3</crawler.version>
<crawler.version>1.0.4-SNAPSHOT</crawler.version>
<!-- Suggest -->
<suggest.version>2.1.1</suggest.version>

View file

@ -161,7 +161,7 @@ public class FessCrawlerThread extends CrawlerThread {
}
} finally {
if (responseData != null) {
IOUtils.closeQuietly(responseData.getResponseBody());
IOUtils.closeQuietly(responseData);
}
}
}

View file

@ -27,7 +27,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.TikaMetadataKeys;
@ -79,12 +78,11 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
@Override
public ResultData transform(final ResponseData responseData) {
if (responseData == null || responseData.getResponseBody() == null) {
if (responseData == null || !responseData.hasResponseBody()) {
throw new CrawlingAccessException("No response body.");
}
final Extractor extractor = getExtractor(responseData);
final InputStream in = responseData.getResponseBody();
final Map<String, String> params = new HashMap<String, String>();
params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
final String mimeType = responseData.getMimeType();
@ -94,7 +92,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
final Map<String, Object> dataMap = new HashMap<String, Object>();
final Map<String, Object> metaDataMap = new HashMap<>();
String content;
try {
try (final InputStream in = responseData.getResponseBody()) {
final ExtractData extractData = extractor.getText(in, params);
content = extractData.getContent();
if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
@ -148,8 +146,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
rcae.setLogLevel(CrawlingAccessException.WARN);
throw rcae;
} finally {
IOUtils.closeQuietly(in);
}
if (content == null) {
content = StringUtil.EMPTY;
@ -308,7 +304,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// TODO date
// TODO lang
// lang
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
}
// id
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId

View file

@ -16,9 +16,6 @@
package org.codelibs.fess.crawler.transformer;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
@ -33,7 +30,6 @@ import java.util.Set;
import javax.annotation.PostConstruct;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.xpath.objects.XObject;
import org.codelibs.core.io.InputStreamUtil;
@ -51,7 +47,6 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.ResponseDataUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.helper.CrawlingConfigHelper;
@ -99,88 +94,67 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
final File tempFile = ResponseDataUtil.createResponseBodyFile(responseData);
try {
final DOMParser parser = getDomParser();
BufferedInputStream bis = null;
final DOMParser parser = getDomParser();
try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
bis.mark(UTF8_BOM_SIZE);
bis.read(bomBytes); // NOSONAR
if (!isUtf8BomBytes(bomBytes)) {
bis.reset();
}
final InputSource is = new InputSource(bis);
if (responseData.getCharSet() != null) {
is.setEncoding(responseData.getCharSet());
}
parser.parse(is);
} catch (final Exception e) {
throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
}
final Document document = parser.getDocument();
final Map<String, Object> dataMap = new LinkedHashMap<String, Object>();
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
final String path = entry.getValue();
try {
bis = new BufferedInputStream(new FileInputStream(tempFile));
final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
bis.mark(UTF8_BOM_SIZE);
bis.read(bomBytes); // NOSONAR
if (!isUtf8BomBytes(bomBytes)) {
bis.reset();
final XObject xObj = getXPathAPI().eval(document, path);
final int type = xObj.getType();
switch (type) {
case XObject.CLASS_BOOLEAN:
final boolean b = xObj.bool();
putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
break;
case XObject.CLASS_NUMBER:
final double d = xObj.num();
putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
break;
case XObject.CLASS_STRING:
final String str = xObj.str();
putResultDataBody(dataMap, entry.getKey(), str);
break;
case XObject.CLASS_NULL:
case XObject.CLASS_UNKNOWN:
case XObject.CLASS_NODESET:
case XObject.CLASS_RTREEFRAG:
case XObject.CLASS_UNRESOLVEDVARIABLE:
default:
final Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
break;
}
final InputSource is = new InputSource(bis);
if (responseData.getCharSet() != null) {
is.setEncoding(responseData.getCharSet());
}
parser.parse(is);
} catch (final Exception e) {
throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
} finally {
IOUtils.closeQuietly(bis);
}
final Document document = parser.getDocument();
final Map<String, Object> dataMap = new LinkedHashMap<String, Object>();
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
final String path = entry.getValue();
try {
final XObject xObj = getXPathAPI().eval(document, path);
final int type = xObj.getType();
switch (type) {
case XObject.CLASS_BOOLEAN:
final boolean b = xObj.bool();
putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
break;
case XObject.CLASS_NUMBER:
final double d = xObj.num();
putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
break;
case XObject.CLASS_STRING:
final String str = xObj.str();
putResultDataBody(dataMap, entry.getKey(), str);
break;
case XObject.CLASS_NULL:
case XObject.CLASS_UNKNOWN:
case XObject.CLASS_NODESET:
case XObject.CLASS_RTREEFRAG:
case XObject.CLASS_UNRESOLVEDVARIABLE:
default:
final Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
break;
}
} catch (final TransformerException e) {
logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue());
}
}
FileInputStream fis = null;
try {
fis = new FileInputStream(tempFile);
responseData.setResponseBody(fis);
putAdditionalData(dataMap, responseData, document);
} catch (final FileNotFoundException e) {
logger.warn(tempFile + " does not exist.", e);
putAdditionalData(dataMap, responseData, document);
} finally {
IOUtils.closeQuietly(fis);
}
try {
resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
} catch (final Exception e) {
throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
}
resultData.setEncoding(charsetName);
} finally {
if (!tempFile.delete()) {
logger.warn("Could not delete a temp file: " + tempFile);
} catch (final TransformerException e) {
logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue());
}
}
putAdditionalData(dataMap, responseData, document);
try {
resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
} catch (final Exception e) {
throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
}
resultData.setEncoding(charsetName);
}
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
@ -243,10 +217,9 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
if (charSet == null) {
charSet = Constants.UTF_8;
}
try {
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);

View file

@ -527,7 +527,7 @@ public class ViewHelper implements Serializable {
writeContentType(response, responseData);
writeNoCache(response, responseData);
response.stream(out -> {
try (InputStream is = new BufferedInputStream(responseData.getResponseBody())) {
try (final InputStream is = new BufferedInputStream(responseData.getResponseBody())) {
out.write(is);
} catch (final IOException e) {
if (!"ClientAbortException".equals(e.getClass().getSimpleName())) {