modify xml files for crawler

This commit is contained in:
Shinsuke Sugaya 2015-08-29 16:23:02 +09:00
parent af13210d13
commit bc46f10861
11 changed files with 225 additions and 227 deletions

View file

@ -32,6 +32,7 @@ import org.codelibs.fess.app.web.base.FessSearchAction;
import org.codelibs.fess.es.exentity.ClickLog;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.SearchLogHelper;
import org.codelibs.fess.helper.ViewHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.util.CharUtil;
import org.elasticsearch.index.query.QueryBuilders;
@ -131,9 +132,9 @@ public class GoAction extends FessSearchAction {
if (isFileSystemPath(url)) {
if (Constants.TRUE.equals(crawlerProperties.getProperty(Constants.SEARCH_FILE_PROXY_PROPERTY, Constants.TRUE))) {
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final ViewHelper viewHelper = ComponentUtil.getViewHelper();
try {
crawlingConfigHelper.writeContent(doc);
viewHelper.writeContent(doc);
return null;
} catch (final Exception e) {
logger.error("Failed to load: " + doc, e);

View file

@ -78,9 +78,6 @@ public class Crawler implements Serializable {
@Resource
protected FessEsClient fessEsClient;
@Resource
protected ScreenShotManager screenShotManager;
@Resource
protected WebFsIndexHelper webFsIndexHelper;

View file

@ -16,37 +16,16 @@
package org.codelibs.fess.helper;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.io.IOUtils;
import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.misc.Base64Util;
import org.codelibs.fess.Constants;
import org.codelibs.fess.FessSystemException;
import org.codelibs.fess.app.service.DataConfigService;
import org.codelibs.fess.app.service.FileConfigService;
import org.codelibs.fess.app.service.WebConfigService;
import org.codelibs.fess.es.exentity.CrawlingConfig;
import org.codelibs.fess.es.exentity.CrawlingConfig.ConfigType;
import org.codelibs.fess.helper.UserAgentHelper.UserAgentType;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.client.S2RobotClient;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.entity.ResponseData;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.web.util.LaResponseUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -121,135 +100,4 @@ public class CrawlingConfigHelper implements Serializable {
return crawlingConfigMap.get(sessionId);
}
/**
 * Fetches the content behind an indexed document and copies it to the current HTTP response.
 * <p>
 * The crawling configuration is resolved from the document's config id, a robot client for the
 * document URL is obtained through that configuration, and the fetched body is streamed to the
 * servlet response after the file name, content type and no-cache headers have been written.
 *
 * @param doc the indexed document; its config id field and URL field are read
 * @throws FessSystemException if the config id is missing or shorter than two characters, if no
 *         crawling configuration or robot client is found, or if copying fails with an
 *         {@link IOException} whose class is not named {@code ClientAbortException}
 */
public void writeContent(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: " + doc);
    }

    // Validate the config id stored with the document.
    final FieldHelper fieldHelper = ComponentUtil.getFieldHelper();
    final Object rawConfigId = doc.get(fieldHelper.configIdField);
    if (rawConfigId == null) {
        throw new FessSystemException("configId is null.");
    }
    final String configId = rawConfigId.toString();
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + rawConfigId);
    }

    // Look up the crawling configuration through the service matching the config type.
    final ConfigType type = getConfigType(configId);
    if (logger.isDebugEnabled()) {
        logger.debug("configType: " + type + ", configId: " + configId);
    }
    final CrawlingConfig crawlingConfig;
    if (type == ConfigType.WEB) {
        crawlingConfig = SingletonLaContainer.getComponent(WebConfigService.class).getWebConfig(getId(configId));
    } else if (type == ConfigType.FILE) {
        crawlingConfig = SingletonLaContainer.getComponent(FileConfigService.class).getFileConfig(getId(configId));
    } else if (type == ConfigType.DATA) {
        crawlingConfig = SingletonLaContainer.getComponent(DataConfigService.class).getDataConfig(getId(configId));
    } else {
        crawlingConfig = null;
    }
    if (crawlingConfig == null) {
        throw new FessSystemException("No crawlingConfig: " + rawConfigId);
    }

    // Fetch the document with a client initialized from that configuration.
    final String url = (String) doc.get(fieldHelper.urlField);
    final S2RobotClientFactory clientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
    crawlingConfig.initializeClientFactory(clientFactory);
    final S2RobotClient robotClient = clientFactory.getClient(url);
    if (robotClient == null) {
        throw new FessSystemException("No S2RobotClient: " + rawConfigId + ", url: " + url);
    }
    final ResponseData fetched = robotClient.execute(RequestDataBuilder.newRequestData().get().url(url).build());

    // Headers are written before the body is streamed.
    final HttpServletResponse httpResponse = LaResponseUtil.getResponse();
    writeFileName(httpResponse, fetched);
    writeContentType(httpResponse, fetched);
    writeNoCache(httpResponse, fetched);

    InputStream in = null;
    OutputStream out = null;
    try {
        in = new BufferedInputStream(fetched.getResponseBody());
        out = new BufferedOutputStream(httpResponse.getOutputStream());
        CopyUtil.copy(in, out);
        out.flush();
    } catch (final IOException e) {
        // An exception class named ClientAbortException is ignored; anything else is reported.
        final boolean clientAborted = "ClientAbortException".equals(e.getClass().getSimpleName());
        if (!clientAborted) {
            throw new FessSystemException("Failed to write a content. configId: " + rawConfigId + ", url: " + url, e);
        }
    } finally {
        IOUtils.closeQuietly(in);
        IOUtils.closeQuietly(out);
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Finished to write " + url);
    }
}
/**
 * Sets the {@code Pragma}, {@code Cache-Control} and {@code Expires} headers on the response:
 * the first two to {@code no-cache}, the last to a fixed date in 1994.
 *
 * @param response the servlet response that receives the headers
 * @param responseData the fetched content; not read by this method
 */
protected void writeNoCache(final HttpServletResponse response, final ResponseData responseData) {
    final String[][] noCacheHeaders = { //
            { "Pragma", "no-cache" }, //
            { "Cache-Control", "no-cache" }, //
            { "Expires", "Thu, 01 Dec 1994 16:00:00 GMT" } };
    for (final String[] header : noCacheHeaders) {
        response.setHeader(header[0], header[1]);
    }
}
/**
 * Writes a {@code Content-Disposition: attachment} header whose file name is derived from the
 * source URL of the fetched content.
 * <p>
 * The file name is the URL-decoded text after the last {@code '/'} of the URL (or the whole
 * decoded URL when there is no such segment), decoded with the content's charset or UTF-8 when
 * none is reported. The header format depends on the detected user agent:
 * <ul>
 * <li>IE: quoted, percent-encoded UTF-8 {@code filename}</li>
 * <li>OPERA: {@code filename*=utf-8'ja'} followed by the percent-encoded UTF-8 name</li>
 * <li>SAFARI: quoted {@code filename} with the decoded name as is</li>
 * <li>all others: quoted {@code =?utf-8?B?...?=} Base64 form</li>
 * </ul>
 * This method is best effort: any failure is logged at WARN level and no header is written.
 *
 * @param response the servlet response that receives the header
 * @param responseData the fetched content; its charset and URL are read
 */
protected void writeFileName(final HttpServletResponse response, final ResponseData responseData) {
    final UserAgentHelper userAgentHelper = ComponentUtil.getUserAgentHelper();
    final UserAgentType userAgentType = userAgentHelper.getUserAgentType();
    String charset = responseData.getCharSet();
    if (charset == null) {
        charset = Constants.UTF_8;
    }

    final String url = responseData.getUrl();
    if (url == null) {
        // No URL means no file name can be derived; skip the header like any other
        // failure in this method instead of aborting the download with an NPE.
        logger.warn("Failed to write a filename: " + responseData);
        return;
    }

    final int pos = url.lastIndexOf('/');
    try {
        final String name;
        if (pos >= 0 && pos + 1 < url.length()) {
            name = URLDecoder.decode(url.substring(pos + 1), charset);
        } else {
            name = URLDecoder.decode(url, charset);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("userAgentType: " + userAgentType + ", charset: " + charset + ", name: " + name);
        }

        switch (userAgentType) {
        case IE:
            // URLEncoder produces form encoding, where a space becomes '+'. In a header value
            // '+' is a literal plus sign, so spaces must be sent as %20. A real '+' in the
            // name is already encoded as %2B, which makes the replacement unambiguous.
            response.setHeader("Content-Disposition", "attachment; filename=\""
                    + URLEncoder.encode(name, Constants.UTF_8).replace("+", "%20") + "\"");
            break;
        case OPERA:
            // Same '+' versus %20 issue as above for the extended filename* form.
            response.setHeader("Content-Disposition", "attachment; filename*=utf-8'ja'"
                    + URLEncoder.encode(name, Constants.UTF_8).replace("+", "%20"));
            break;
        case SAFARI:
            response.setHeader("Content-Disposition", "attachment; filename=\"" + name + "\"");
            break;
        case CHROME:
        case FIREFOX:
        case OTHER:
        default:
            response.setHeader("Content-Disposition",
                    "attachment; filename=\"=?utf-8?B?" + Base64Util.encode(name.getBytes(Constants.UTF_8)) + "?=\"");
            break;
        }
    } catch (final Exception e) {
        logger.warn("Failed to write a filename: " + responseData, e);
    }
}
/**
 * Sets the response content type from the MIME type of the fetched content.
 * <p>
 * Nothing is set when the MIME type is {@code null}. For MIME types starting with
 * {@code text/}, the response's current character encoding is appended as the
 * {@code charset} parameter when it is not {@code null}.
 *
 * @param response the servlet response whose content type is set
 * @param responseData the fetched content; its MIME type is read
 */
protected void writeContentType(final HttpServletResponse response, final ResponseData responseData) {
    final String mimeType = responseData.getMimeType();
    if (logger.isDebugEnabled()) {
        logger.debug("mimeType: " + mimeType);
    }
    if (mimeType != null) {
        String contentType = mimeType;
        if (mimeType.startsWith("text/")) {
            final String encoding = response.getCharacterEncoding();
            if (encoding != null) {
                contentType = mimeType + "; charset=" + encoding;
            }
        }
        response.setContentType(contentType);
    }
}
}

View file

@ -16,7 +16,12 @@
package org.codelibs.fess.helper;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
@ -34,21 +39,36 @@ import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.CoreLibConstants;
import org.codelibs.core.io.CopyUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Base64Util;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.core.net.URLUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.FessSystemException;
import org.codelibs.fess.app.service.DataConfigService;
import org.codelibs.fess.app.service.FileConfigService;
import org.codelibs.fess.app.service.WebConfigService;
import org.codelibs.fess.entity.FacetQueryView;
import org.codelibs.fess.es.exentity.CrawlingConfig;
import org.codelibs.fess.es.exentity.CrawlingConfig.ConfigType;
import org.codelibs.fess.helper.UserAgentHelper.UserAgentType;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ResourceUtil;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.client.S2RobotClient;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.util.CharUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.taglib.function.LaFunctions;
import org.lastaflute.web.util.LaRequestUtil;
import org.lastaflute.web.util.LaResponseUtil;
import org.lastaflute.web.util.LaServletContextUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -456,6 +476,138 @@ public class ViewHelper implements Serializable {
return null;
}
/**
 * Fetches the content behind an indexed document and copies it to the current HTTP response.
 * <p>
 * The crawling configuration is resolved from the document's config id with the help of
 * {@link CrawlingConfigHelper}, a robot client for the document URL is obtained through that
 * configuration, and the fetched body is streamed to the servlet response after the file name,
 * content type and no-cache headers have been written.
 *
 * @param doc the indexed document; its config id field and URL field are read
 * @throws FessSystemException if the config id is missing or shorter than two characters, if no
 *         crawling configuration or robot client is found, or if copying fails with an
 *         {@link IOException} whose class is not named {@code ClientAbortException}
 */
public void writeContent(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: " + doc);
    }

    // Validate the config id stored with the document.
    final FieldHelper fieldHelper = ComponentUtil.getFieldHelper();
    final CrawlingConfigHelper configHelper = ComponentUtil.getCrawlingConfigHelper();
    final Object rawConfigId = doc.get(fieldHelper.configIdField);
    if (rawConfigId == null) {
        throw new FessSystemException("configId is null.");
    }
    final String configId = rawConfigId.toString();
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + rawConfigId);
    }

    // Look up the crawling configuration through the service matching the config type.
    final ConfigType type = configHelper.getConfigType(configId);
    if (logger.isDebugEnabled()) {
        logger.debug("configType: " + type + ", configId: " + configId);
    }
    final CrawlingConfig crawlingConfig;
    if (type == ConfigType.WEB) {
        crawlingConfig = SingletonLaContainer.getComponent(WebConfigService.class).getWebConfig(configHelper.getId(configId));
    } else if (type == ConfigType.FILE) {
        crawlingConfig = SingletonLaContainer.getComponent(FileConfigService.class).getFileConfig(configHelper.getId(configId));
    } else if (type == ConfigType.DATA) {
        crawlingConfig = SingletonLaContainer.getComponent(DataConfigService.class).getDataConfig(configHelper.getId(configId));
    } else {
        crawlingConfig = null;
    }
    if (crawlingConfig == null) {
        throw new FessSystemException("No crawlingConfig: " + rawConfigId);
    }

    // Fetch the document with a client initialized from that configuration.
    final String url = (String) doc.get(fieldHelper.urlField);
    final S2RobotClientFactory clientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
    crawlingConfig.initializeClientFactory(clientFactory);
    final S2RobotClient robotClient = clientFactory.getClient(url);
    if (robotClient == null) {
        throw new FessSystemException("No S2RobotClient: " + rawConfigId + ", url: " + url);
    }
    final ResponseData fetched = robotClient.execute(RequestDataBuilder.newRequestData().get().url(url).build());

    // Headers are written before the body is streamed.
    final HttpServletResponse httpResponse = LaResponseUtil.getResponse();
    writeFileName(httpResponse, fetched);
    writeContentType(httpResponse, fetched);
    writeNoCache(httpResponse, fetched);

    InputStream in = null;
    OutputStream out = null;
    try {
        in = new BufferedInputStream(fetched.getResponseBody());
        out = new BufferedOutputStream(httpResponse.getOutputStream());
        CopyUtil.copy(in, out);
        out.flush();
    } catch (final IOException e) {
        // An exception class named ClientAbortException is ignored; anything else is reported.
        final boolean clientAborted = "ClientAbortException".equals(e.getClass().getSimpleName());
        if (!clientAborted) {
            throw new FessSystemException("Failed to write a content. configId: " + rawConfigId + ", url: " + url, e);
        }
    } finally {
        IOUtils.closeQuietly(in);
        IOUtils.closeQuietly(out);
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Finished to write " + url);
    }
}
/**
 * Sets the {@code Pragma}, {@code Cache-Control} and {@code Expires} headers on the response:
 * the first two to {@code no-cache}, the last to a fixed date in 1994.
 *
 * @param response the servlet response that receives the headers
 * @param responseData the fetched content; not read by this method
 */
protected void writeNoCache(final HttpServletResponse response, final ResponseData responseData) {
    final String[][] noCacheHeaders = { //
            { "Pragma", "no-cache" }, //
            { "Cache-Control", "no-cache" }, //
            { "Expires", "Thu, 01 Dec 1994 16:00:00 GMT" } };
    for (final String[] header : noCacheHeaders) {
        response.setHeader(header[0], header[1]);
    }
}
/**
 * Writes a {@code Content-Disposition: attachment} header whose file name is derived from the
 * source URL of the fetched content.
 * <p>
 * The file name is the URL-decoded text after the last {@code '/'} of the URL (or the whole
 * decoded URL when there is no such segment), decoded with the content's charset or UTF-8 when
 * none is reported. The header format depends on the detected user agent:
 * <ul>
 * <li>IE: quoted, percent-encoded UTF-8 {@code filename}</li>
 * <li>OPERA: {@code filename*=utf-8'ja'} followed by the percent-encoded UTF-8 name</li>
 * <li>SAFARI: quoted {@code filename} with the decoded name as is</li>
 * <li>all others: quoted {@code =?utf-8?B?...?=} Base64 form</li>
 * </ul>
 * This method is best effort: any failure is logged at WARN level and no header is written.
 *
 * @param response the servlet response that receives the header
 * @param responseData the fetched content; its charset and URL are read
 */
protected void writeFileName(final HttpServletResponse response, final ResponseData responseData) {
    final UserAgentHelper userAgentHelper = ComponentUtil.getUserAgentHelper();
    final UserAgentType userAgentType = userAgentHelper.getUserAgentType();
    String charset = responseData.getCharSet();
    if (charset == null) {
        charset = Constants.UTF_8;
    }

    final String url = responseData.getUrl();
    if (url == null) {
        // No URL means no file name can be derived; skip the header like any other
        // failure in this method instead of aborting the download with an NPE.
        logger.warn("Failed to write a filename: " + responseData);
        return;
    }

    final int pos = url.lastIndexOf('/');
    try {
        final String name;
        if (pos >= 0 && pos + 1 < url.length()) {
            name = URLDecoder.decode(url.substring(pos + 1), charset);
        } else {
            name = URLDecoder.decode(url, charset);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("userAgentType: " + userAgentType + ", charset: " + charset + ", name: " + name);
        }

        switch (userAgentType) {
        case IE:
            // URLEncoder produces form encoding, where a space becomes '+'. In a header value
            // '+' is a literal plus sign, so spaces must be sent as %20. A real '+' in the
            // name is already encoded as %2B, which makes the replacement unambiguous.
            response.setHeader("Content-Disposition", "attachment; filename=\""
                    + URLEncoder.encode(name, Constants.UTF_8).replace("+", "%20") + "\"");
            break;
        case OPERA:
            // Same '+' versus %20 issue as above for the extended filename* form.
            response.setHeader("Content-Disposition", "attachment; filename*=utf-8'ja'"
                    + URLEncoder.encode(name, Constants.UTF_8).replace("+", "%20"));
            break;
        case SAFARI:
            response.setHeader("Content-Disposition", "attachment; filename=\"" + name + "\"");
            break;
        case CHROME:
        case FIREFOX:
        case OTHER:
        default:
            response.setHeader("Content-Disposition",
                    "attachment; filename=\"=?utf-8?B?" + Base64Util.encode(name.getBytes(Constants.UTF_8)) + "?=\"");
            break;
        }
    } catch (final Exception e) {
        logger.warn("Failed to write a filename: " + responseData, e);
    }
}
/**
 * Sets the response content type from the MIME type of the fetched content.
 * <p>
 * Nothing is set when the MIME type is {@code null}. For MIME types starting with
 * {@code text/}, the response's current character encoding is appended as the
 * {@code charset} parameter when it is not {@code null}.
 *
 * @param response the servlet response whose content type is set
 * @param responseData the fetched content; its MIME type is read
 */
protected void writeContentType(final HttpServletResponse response, final ResponseData responseData) {
    final String mimeType = responseData.getMimeType();
    if (logger.isDebugEnabled()) {
        logger.debug("mimeType: " + mimeType);
    }
    if (mimeType != null) {
        String contentType = mimeType;
        if (mimeType.startsWith("text/")) {
            final String encoding = response.getCharacterEncoding();
            if (encoding != null) {
                contentType = mimeType + "; charset=" + encoding;
            }
        }
        response.setContentType(contentType);
    }
}
/**
 * Returns the current value of the {@code useSession} field.
 *
 * @return the {@code useSession} flag
 */
public boolean isUseSession() {
return useSession;
}

View file

@ -64,7 +64,7 @@ public class ResourceUtil {
if (servletContext != null) {
path = servletContext.getRealPath("/" + baseName + name);
}
} catch (final Exception e) { // NOSONAR
} catch (final Throwable e) { // NOSONAR
// ignore
}
if (path == null) {

View file

@ -297,4 +297,46 @@
</postConstruct>
</component>
<component name="screenShotManager" class="org.codelibs.fess.screenshot.ScreenShotManager">
<!--
Optional wiring, disabled as shipped: the postConstruct below calls "add" with the
htmlScreenShotGenerator component. That component is only defined in the commented-out
examples following this component, so enable one of them together with this call.
NOTE(review): presumably "add" registers the generator with the manager; confirm in ScreenShotManager.
-->
<!--
<postConstruct name="add">
<arg>htmlScreenShotGenerator</arg>
</postConstruct>
-->
</component>
<!--
<component name="webDriver" class="org.openqa.selenium.phantomjs.PhantomJSDriver">
<arg>
<component class="org.openqa.selenium.remote.DesiredCapabilities">
<postConstruct name="setCapability">
<arg>"phantomjs.binary.path"</arg>
<arg>"/usr/bin/phantomjs"</arg>
</postConstruct>
</component>
</arg>
<preDestroy name="quit"></preDestroy>
</component>
<component name="htmlScreenShotGenerator" class="org.codelibs.fess.screenshot.impl.WebDriverGenerator">
<property name="webDriver">webDriver</property>
<postConstruct name="addCondition">
<arg>"mimetype"</arg>
<arg>"text/html"</arg>
</postConstruct>
</component>
-->
<!--
<component name="htmlScreenShotGenerator" class="org.codelibs.fess.screenshot.impl.CommandGenerator">
<property name="commandList">
{"bash",
"/opt/fess/bin/html-screenshot.sh",
"${url}",
"${outputFile}"}
</property>
<postConstruct name="addCondition">
<arg>"mimetype"</arg>
<arg>"text/html"</arg>
</postConstruct>
</component>
-->
</components>

View file

@ -156,45 +156,4 @@
<property name="roleSeparator">","</property>
-->
</component>
<component name="screenShotManager" class="org.codelibs.fess.screenshot.ScreenShotManager">
<!--
Optional wiring, disabled as shipped: the postConstruct below calls "add" with the
htmlScreenShotGenerator component. That component is only defined in the commented-out
examples following this component, so enable one of them together with this call.
NOTE(review): presumably "add" registers the generator with the manager; confirm in ScreenShotManager.
-->
<!--
<postConstruct name="add">
<arg>htmlScreenShotGenerator</arg>
</postConstruct>
-->
</component>
<!--
<component name="webDriver" class="org.openqa.selenium.phantomjs.PhantomJSDriver">
<arg>
<component class="org.openqa.selenium.remote.DesiredCapabilities">
<postConstruct name="setCapability">
<arg>"phantomjs.binary.path"</arg>
<arg>"/usr/bin/phantomjs"</arg>
</postConstruct>
</component>
</arg>
<preDestroy name="quit"></preDestroy>
</component>
<component name="htmlScreenShotGenerator" class="org.codelibs.fess.screenshot.impl.WebDriverGenerator">
<property name="webDriver">webDriver</property>
<postConstruct name="addCondition">
<arg>"mimetype"</arg>
<arg>"text/html"</arg>
</postConstruct>
</component>
-->
<!--
<component name="htmlScreenShotGenerator" class="org.codelibs.fess.screenshot.impl.CommandGenerator">
<property name="commandList">
{"bash",
"/opt/fess/bin/html-screenshot.sh",
"${url}",
"${outputFile}"}
</property>
<postConstruct name="addCondition">
<arg>"mimetype"</arg>
<arg>"text/html"</arg>
</postConstruct>
</component>
-->
</components>

View file

@ -6,10 +6,10 @@
<component name="contentLengthHelper"
class="org.codelibs.robot.helper.ContentLengthHelper" instance="singleton">
<property name="defaultMaxLength">10485760L</property><!-- 10M -->
<property name="defaultMaxLength">10485760</property><!-- 10M -->
<postConstruct name="addMaxLength">
<arg>"text/html"</arg>
<arg>2621440L</arg><!-- 2.5M -->
<arg>2621440</arg><!-- 2.5M -->
</postConstruct>
</component>
</components>

View file

@ -14,7 +14,7 @@
<property name="invalidUrlPattern">@java.util.regex.Pattern@compile("^\\s*javascript:|^\\s*mailto:|^\\s*irc:|^\\s*skype:|^\\s*callto:",@java.util.regex.Pattern@CASE_INSENSITIVE)</property>
-->
<property name="convertUrlMap">
#{"feed:" : "http:"}
{"feed:" : "http:"}
</property>
<!--
<property name="cacheXpath">"//BODY"</property>

View file

@ -1,9 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//SEASAR//DTD S2Container 2.4//EN"
"http://www.seasar.org/dtd/components24.dtd">
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components>
<include path="convention.xml" />
<include path="lastaflute.xml"/>
<include path="fess.xml" />
<include path="s2robot_es.xml" />
@ -20,12 +19,12 @@
</component>
<component name="intervalControlHelper" class="org.codelibs.fess.helper.IntervalControlHelper">
<!--
<initMethod name="addIntervalRule">
<postConstruct name="addIntervalRule">
<arg>"5:00"</arg>
<arg>"10:00"</arg>
<arg>"2,3,4,5,6"</arg>
<arg>3600000</arg>
</initMethod>
</postConstruct>
-->
</component>
<component name="sambaHelper" class="org.codelibs.fess.helper.SambaHelper">
@ -36,53 +35,53 @@
<property name="maxDocumentCacheSize">5</property>
<property name="unprocessedDocumentSize">100</property>
<property name="threadDump">false</property>
<initMethod name="addBoostDocumentRule">
<postConstruct name="addBoostDocumentRule">
<arg>
<component class="org.codelibs.fess.solr.BoostDocumentRule">
<property name="matchExpression">"url.matches(\".*fess.*\")"</property>
<property name="boostExpression">"1000.0"</property>
</component>
</arg>
</initMethod>
<initMethod name="addDefaultDocValue">
</postConstruct>
<postConstruct name="addDefaultDocValue">
<arg>"FieldName"</arg>
<arg>"VALUE"</arg>
</initMethod>
</postConstruct>
-->
</component>
<component name="fileTypeHelper" class="org.codelibs.fess.helper.FileTypeHelper">
<initMethod name="add">
<postConstruct name="add">
<arg>"text/html"</arg>
<arg>"html"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/msword"</arg>
<arg>"word"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/vnd.openxmlformats-officedocument.wordprocessingml.document"</arg>
<arg>"word"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/vnd.ms-excel"</arg>
<arg>"excel"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"</arg>
<arg>"excel"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/vnd.ms-powerpoint"</arg>
<arg>"powerpoint"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/vnd.openxmlformats-officedocument.presentationml.presentation"</arg>
<arg>"powerpoint"</arg>
</initMethod>
<initMethod name="add">
</postConstruct>
<postConstruct name="add">
<arg>"application/pdf"</arg>
<arg>"pdf"</arg>
</initMethod>
</postConstruct>
</component>
<component name="fessCrawler" class="org.codelibs.fess.exec.Crawler"
instance="prototype">