Shinsuke Sugaya 9 年之前
父节点
当前提交
e7c6906bac

+ 1 - 1
pom.xml

@@ -59,7 +59,7 @@
 		<utflute.version>0.6.0F</utflute.version>
 
 		<!-- Crawler -->
-		<crawler.version>1.0.3</crawler.version>
+		<crawler.version>1.0.4-SNAPSHOT</crawler.version>
 
 		<!-- Suggest -->
 		<suggest.version>2.1.1</suggest.version>

+ 1 - 1
src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

@@ -161,7 +161,7 @@ public class FessCrawlerThread extends CrawlerThread {
                 }
             } finally {
                 if (responseData != null) {
-                    IOUtils.closeQuietly(responseData.getResponseBody());
+                    IOUtils.closeQuietly(responseData);
                 }
             }
         }

+ 6 - 7
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -27,7 +27,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.TikaMetadataKeys;
@@ -79,12 +78,11 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
 
     @Override
     public ResultData transform(final ResponseData responseData) {
-        if (responseData == null || responseData.getResponseBody() == null) {
+        if (responseData == null || !responseData.hasResponseBody()) {
             throw new CrawlingAccessException("No response body.");
         }
 
         final Extractor extractor = getExtractor(responseData);
-        final InputStream in = responseData.getResponseBody();
         final Map<String, String> params = new HashMap<String, String>();
         params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
         final String mimeType = responseData.getMimeType();
@@ -94,7 +92,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final Map<String, Object> dataMap = new HashMap<String, Object>();
         final Map<String, Object> metaDataMap = new HashMap<>();
         String content;
-        try {
+        try (final InputStream in = responseData.getResponseBody()) {
             final ExtractData extractData = extractor.getText(in, params);
             content = extractData.getContent();
             if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
@@ -148,8 +146,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
             rcae.setLogLevel(CrawlingAccessException.WARN);
             throw rcae;
-        } finally {
-            IOUtils.closeQuietly(in);
         }
         if (content == null) {
             content = StringUtil.EMPTY;
@@ -308,7 +304,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         }
         putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
         // TODO date
-        // TODO lang
+        // lang
+        if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
+            putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
+        }
         // id
         putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
         // parentId

+ 56 - 83
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -16,9 +16,6 @@
 package org.codelibs.fess.crawler.transformer;
 
 import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
@@ -33,7 +30,6 @@ import java.util.Set;
 import javax.annotation.PostConstruct;
 import javax.xml.transform.TransformerException;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.xpath.objects.XObject;
 import org.codelibs.core.io.InputStreamUtil;
@@ -51,7 +47,6 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
-import org.codelibs.fess.crawler.util.ResponseDataUtil;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
@@ -99,88 +94,67 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     @Override
     protected void storeData(final ResponseData responseData, final ResultData resultData) {
-        final File tempFile = ResponseDataUtil.createResponseBodyFile(responseData);
-        try {
-            final DOMParser parser = getDomParser();
-            BufferedInputStream bis = null;
-            try {
-                bis = new BufferedInputStream(new FileInputStream(tempFile));
-                final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
-                bis.mark(UTF8_BOM_SIZE);
-                bis.read(bomBytes); // NOSONAR
-                if (!isUtf8BomBytes(bomBytes)) {
-                    bis.reset();
-                }
-                final InputSource is = new InputSource(bis);
-                if (responseData.getCharSet() != null) {
-                    is.setEncoding(responseData.getCharSet());
-                }
-                parser.parse(is);
-            } catch (final Exception e) {
-                throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
-            } finally {
-                IOUtils.closeQuietly(bis);
+        final DOMParser parser = getDomParser();
+        try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
+            final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
+            bis.mark(UTF8_BOM_SIZE);
+            bis.read(bomBytes); // NOSONAR
+            if (!isUtf8BomBytes(bomBytes)) {
+                bis.reset();
             }
-
-            final Document document = parser.getDocument();
-
-            final Map<String, Object> dataMap = new LinkedHashMap<String, Object>();
-            for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
-                final String path = entry.getValue();
-                try {
-                    final XObject xObj = getXPathAPI().eval(document, path);
-                    final int type = xObj.getType();
-                    switch (type) {
-                    case XObject.CLASS_BOOLEAN:
-                        final boolean b = xObj.bool();
-                        putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
-                        break;
-                    case XObject.CLASS_NUMBER:
-                        final double d = xObj.num();
-                        putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
-                        break;
-                    case XObject.CLASS_STRING:
-                        final String str = xObj.str();
-                        putResultDataBody(dataMap, entry.getKey(), str);
-                        break;
-                    case XObject.CLASS_NULL:
-                    case XObject.CLASS_UNKNOWN:
-                    case XObject.CLASS_NODESET:
-                    case XObject.CLASS_RTREEFRAG:
-                    case XObject.CLASS_UNRESOLVEDVARIABLE:
-                    default:
-                        final Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
-                        putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
-                        break;
-                    }
-                } catch (final TransformerException e) {
-                    logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue());
-                }
+            final InputSource is = new InputSource(bis);
+            if (responseData.getCharSet() != null) {
+                is.setEncoding(responseData.getCharSet());
             }
+            parser.parse(is);
+        } catch (final Exception e) {
+            throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
+        }
 
-            FileInputStream fis = null;
-            try {
-                fis = new FileInputStream(tempFile);
-                responseData.setResponseBody(fis);
-                putAdditionalData(dataMap, responseData, document);
-            } catch (final FileNotFoundException e) {
-                logger.warn(tempFile + " does not exist.", e);
-                putAdditionalData(dataMap, responseData, document);
-            } finally {
-                IOUtils.closeQuietly(fis);
-            }
+        final Document document = parser.getDocument();
 
+        final Map<String, Object> dataMap = new LinkedHashMap<String, Object>();
+        for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
+            final String path = entry.getValue();
             try {
-                resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
-            } catch (final Exception e) {
-                throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
-            }
-            resultData.setEncoding(charsetName);
-        } finally {
-            if (!tempFile.delete()) {
-                logger.warn("Could not delete a temp file: " + tempFile);
+                final XObject xObj = getXPathAPI().eval(document, path);
+                final int type = xObj.getType();
+                switch (type) {
+                case XObject.CLASS_BOOLEAN:
+                    final boolean b = xObj.bool();
+                    putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
+                    break;
+                case XObject.CLASS_NUMBER:
+                    final double d = xObj.num();
+                    putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
+                    break;
+                case XObject.CLASS_STRING:
+                    final String str = xObj.str();
+                    putResultDataBody(dataMap, entry.getKey(), str);
+                    break;
+                case XObject.CLASS_NULL:
+                case XObject.CLASS_UNKNOWN:
+                case XObject.CLASS_NODESET:
+                case XObject.CLASS_RTREEFRAG:
+                case XObject.CLASS_UNRESOLVEDVARIABLE:
+                default:
+                    final Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
+                    putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
+                    break;
+                }
+            } catch (final TransformerException e) {
+                logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue());
             }
         }
+
+        putAdditionalData(dataMap, responseData, document);
+
+        try {
+            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
+        } catch (final Exception e) {
+            throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
+        }
+        resultData.setEncoding(charsetName);
     }
 
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
@@ -243,10 +217,9 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 if (charSet == null) {
                     charSet = Constants.UTF_8;
                 }
-                try {
+                try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                     // cache
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
-                            new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                     putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
                 } catch (final Exception e) {
                     logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);

+ 1 - 1
src/main/java/org/codelibs/fess/helper/ViewHelper.java

@@ -527,7 +527,7 @@ public class ViewHelper implements Serializable {
         writeContentType(response, responseData);
         writeNoCache(response, responseData);
         response.stream(out -> {
-            try (InputStream is = new BufferedInputStream(responseData.getResponseBody())) {
+            try (final InputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                 out.write(is);
             } catch (final IOException e) {
                 if (!"ClientAbortException".equals(e.getClass().getSimpleName())) {