This commit is contained in:
Shinsuke Sugaya 2014-07-13 20:41:38 +09:00
parent dcd9ef4e1a
commit 1ff431a351
11 changed files with 76 additions and 54 deletions

28
pom.xml
View file

@ -69,7 +69,7 @@
<database>h2</database>
<databaseGroupId>com.h2database</databaseGroupId>
<databaseArtifactId>h2</databaseArtifactId>
<databaseVersion>1.3.172</databaseVersion>
<databaseVersion>1.4.178</databaseVersion>
<databaseDriver>org.h2.Driver</databaseDriver>
<databaseUrl>jdbc:h2:file:${basedir}/src/main/webapp/WEB-INF/db/fess</databaseUrl>
<databaseTestUrl>jdbc:h2:file:${basedir}/target/test-classes/db/fess</databaseTestUrl>
@ -83,7 +83,7 @@
<database>mysql</database>
<databaseGroupId>mysql</databaseGroupId>
<databaseArtifactId>mysql-connector-java</databaseArtifactId>
<databaseVersion>5.1.30</databaseVersion>
<databaseVersion>5.1.31</databaseVersion>
<databaseDriver>com.mysql.jdbc.Driver</databaseDriver>
<databaseUrl>jdbc:mysql://localhost:3306/fess_db?noDatetimeStringSync=true&amp;zeroDateTimeBehavior=convertToNull&amp;useUnicode=true&amp;characterEncoding=UTF-8&amp;autoReconnect=true</databaseUrl>
<databaseTestUrl>jdbc:mysql://localhost:3306/fess_testdb?noDatetimeStringSync=true&amp;zeroDateTimeBehavior=convertToNull&amp;useUnicode=true&amp;characterEncoding=UTF-8&amp;autoReconnect=true</databaseTestUrl>
@ -108,7 +108,7 @@
</profiles>
<properties>
<dbflute.version>1.0.4K</dbflute.version>
<s2robot.version>0.7.2</s2robot.version>
<s2robot.version>0.8.0-SNAPSHOT</s2robot.version>
<solr.version>4.8.1</solr.version>
<slf4j.version>1.7.7</slf4j.version>
<poi.version>3.10-FINAL</poi.version>
@ -601,15 +601,15 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
<version>2.2</version>
<scope>provided</scope>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
<version>2.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.mail</groupId>
<artifactId>javax.mail-api</artifactId>
<version>1.5.1</version>
<version>1.5.2</version>
</dependency>
<dependency>
<groupId>javax.activation</groupId>
@ -658,7 +658,7 @@
<dependency>
<groupId>args4j</groupId>
<artifactId>args4j</artifactId>
<version>2.0.26</version>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
@ -688,7 +688,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.4</version>
<version>1.8.6</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
@ -714,22 +714,22 @@
<dependency>
<groupId>org.javassist</groupId>
<artifactId>javassist</artifactId>
<version>3.18.1-GA</version>
<version>3.18.2-GA</version>
</dependency>
<dependency>
<groupId>net.arnx</groupId>
<artifactId>jsonic</artifactId>
<version>1.3.3</version>
<version>1.3.5</version>
</dependency>
<dependency>
<groupId>com.github.jknack</groupId>
<artifactId>handlebars</artifactId>
<version>1.3.0</version>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.2.2</version>
<version>2.3.3</version>
</dependency>
<dependency>
<groupId>com.github.detro</groupId>

View file

@ -6,7 +6,7 @@ DROP TABLE IF EXISTS CLICK_LOG;
DROP TABLE IF EXISTS LABEL_TYPE_TO_ROLE_TYPE_MAPPING;
DROP TABLE IF EXISTS SEARCH_LOG;
DROP TABLE IF EXISTS USER_INFO;
DROP TABLE IF EXISTS DATA_CONFIG_TO_BROWSER_TYPE_MAPPING
DROP TABLE IF EXISTS DATA_CONFIG_TO_BROWSER_TYPE_MAPPING;
DROP TABLE IF EXISTS DATA_CONFIG_TO_LABEL_TYPE_MAPPING;
DROP TABLE IF EXISTS DATA_CONFIG_TO_ROLE_TYPE_MAPPING;
DROP TABLE IF EXISTS DATA_CRAWLING_CONFIG;

Binary file not shown.

Binary file not shown.

View file

@ -34,6 +34,7 @@ import org.codelibs.solr.lib.SolrGroup;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.SerializeUtil;
import org.seasar.robot.RobotSystemException;
import org.seasar.robot.builder.RequestDataBuilder;
import org.seasar.robot.client.S2RobotClient;
import org.seasar.robot.client.S2RobotClientFactory;
import org.seasar.robot.entity.ResponseData;
@ -196,7 +197,9 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
}
final long startTime = System.currentTimeMillis();
final ResponseData responseData = client.doGet(url);
final ResponseData responseData = client
.execute(RequestDataBuilder.newRequestData().get()
.url(url).build());
responseData.setExecutionTime(System.currentTimeMillis()
- startTime);
responseData.setSessionId((String) dataMap

View file

@ -42,6 +42,7 @@ import jp.sf.fess.util.ComponentUtil;
import org.apache.commons.io.IOUtils;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.Base64Util;
import org.seasar.robot.builder.RequestDataBuilder;
import org.seasar.robot.client.S2RobotClient;
import org.seasar.robot.client.S2RobotClientFactory;
import org.seasar.robot.entity.ResponseData;
@ -179,7 +180,8 @@ public class CrawlingConfigHelper implements Serializable {
throw new FessSystemException("No S2RobotClient: " + configIdObj
+ ", url: " + url);
}
final ResponseData responseData = client.doGet(url);
final ResponseData responseData = client.execute(RequestDataBuilder
.newRequestData().get().url(url).build());
final HttpServletResponse response = ResponseUtil.getResponse();
writeFileName(response, responseData);
writeContentType(response, responseData);

View file

@ -47,8 +47,10 @@ import org.codelibs.solr.lib.SolrGroup;
import org.codelibs.solr.lib.SolrGroupManager;
import org.codelibs.solr.lib.policy.QueryType;
import org.seasar.robot.S2RobotThread;
import org.seasar.robot.builder.RequestDataBuilder;
import org.seasar.robot.client.S2RobotClient;
import org.seasar.robot.client.smb.SmbClient;
import org.seasar.robot.entity.RequestData;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.entity.UrlQueue;
import org.seasar.robot.log.LogType;
@ -88,7 +90,9 @@ public class FessS2RobotThread extends S2RobotThread {
ResponseData responseData = null;
try {
// head method
responseData = client.doHead(urlQueue.getUrl());
responseData = client
.execute(RequestDataBuilder.newRequestData().head()
.url(urlQueue.getUrl()).build());
if (responseData == null) {
return true;
}
@ -217,7 +221,7 @@ public class FessS2RobotThread extends S2RobotThread {
}
protected void storeChildUrlsToQueue(final UrlQueue urlQueue,
final Set<String> childUrlSet) {
final Set<RequestData> childUrlSet) {
if (childUrlSet != null) {
synchronized (robotContext.getAccessCountLock()) {
// add an url
@ -229,7 +233,7 @@ public class FessS2RobotThread extends S2RobotThread {
}
@SuppressWarnings("unchecked")
protected Set<String> getAnchorSet(final Object obj) {
protected Set<RequestData> getAnchorSet(final Object obj) {
List<String> anchorList;
if (obj instanceof String) {
anchorList = new ArrayList<String>();
@ -244,9 +248,10 @@ public class FessS2RobotThread extends S2RobotThread {
return null;
}
final Set<String> childUrlSet = new LinkedHashSet<String>();
final Set<RequestData> childUrlSet = new LinkedHashSet<>();
for (final String anchor : anchorList) {
childUrlSet.add(anchor);
childUrlSet.add(RequestDataBuilder.newRequestData().get()
.url(anchor).build());
}
return childUrlSet;
}
@ -294,7 +299,7 @@ public class FessS2RobotThread extends S2RobotThread {
return null;
}
protected Set<String> getChildUrlSet(final String id) {
protected Set<RequestData> getChildUrlSet(final String id) {
final SolrGroupManager solrGroupManager = ComponentUtil
.getSolrGroupManager();
final SolrGroup solrGroup = solrGroupManager
@ -313,11 +318,12 @@ public class FessS2RobotThread extends S2RobotThread {
if (logger.isDebugEnabled()) {
logger.debug("Found solr documents: " + docList);
}
final Set<String> urlSet = new HashSet<String>(docList.size());
final Set<RequestData> urlSet = new HashSet<>(docList.size());
for (final SolrDocument doc : docList) {
final Object obj = doc.get("url");
if (obj != null) {
urlSet.add(obj.toString());
urlSet.add(RequestDataBuilder.newRequestData().get()
.url(obj.toString()).build());
}
}
return urlSet;

View file

@ -54,8 +54,10 @@ import org.seasar.framework.util.InputStreamUtil;
import org.seasar.framework.util.SerializeUtil;
import org.seasar.robot.RobotCrawlAccessException;
import org.seasar.robot.RobotSystemException;
import org.seasar.robot.builder.RequestDataBuilder;
import org.seasar.robot.client.fs.ChildUrlsException;
import org.seasar.robot.entity.AccessResultData;
import org.seasar.robot.entity.RequestData;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.entity.ResultData;
import org.seasar.robot.entity.UrlQueue;
@ -66,7 +68,6 @@ import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.InputSource;
public class FessXpathTransformer extends AbstractFessXpathTransformer {
@ -201,8 +202,9 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null
&& !canonicalUrl.equals(responseData.getUrl())) {
final Set<String> childUrlSet = new HashSet<String>();
childUrlSet.add(canonicalUrl);
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get()
.url(canonicalUrl).build());
throw new ChildUrlsException(childUrlSet);
}
}
@ -498,18 +500,21 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
}
protected List<String> getAnchorList(final Document document,
protected List<RequestData> getAnchorList(final Document document,
final ResponseData responseData) {
List<String> anchorList = new ArrayList<String>();
List<RequestData> anchorList = new ArrayList<>();
final String baseHref = getBaseHref(document);
try {
final URL url = new URL(baseHref != null ? baseHref
: responseData.getUrl());
for (final Map.Entry<String, String> entry : childUrlRuleMap
.entrySet()) {
anchorList.addAll(getUrlFromTagAttribute(url, document,
for (String u : getUrlFromTagAttribute(url, document,
entry.getKey(), entry.getValue(),
responseData.getCharSet()));
responseData.getCharSet())) {
anchorList.add(RequestDataBuilder.newRequestData().get()
.url(u).build());
}
}
anchorList = convertChildUrlList(anchorList);
} catch (final Exception e) {
@ -521,20 +526,19 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
@Override
protected List<String> convertChildUrlList(final List<String> urlList) {
final List<String> newUrlList = new ArrayList<String>();
protected List<RequestData> convertChildUrlList(
final List<RequestData> urlList) {
if (urlList != null) {
for (String url : urlList) {
for (RequestData requestData : urlList) {
String url = requestData.getUrl();
for (final Map.Entry<String, String> entry : convertUrlMap
.entrySet()) {
url = url.replaceAll(entry.getKey(), entry.getValue());
}
newUrlList.add(replaceOverlappingHost(url));
requestData.setUrl(replaceOverlappingHost(url));
}
}
return newUrlList;
return urlList;
}
public void addPrunedTag(final String tagName) {

View file

@ -33,7 +33,9 @@ import javax.xml.transform.stream.StreamResult;
import org.cyberneko.html.parsers.DOMParser;
import org.seasar.extension.unit.S2TestCase;
import org.seasar.framework.container.ComponentNotFoundRuntimeException;
import org.seasar.robot.builder.RequestDataBuilder;
import org.seasar.robot.client.fs.ChildUrlsException;
import org.seasar.robot.entity.RequestData;
import org.seasar.robot.entity.ResponseData;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
@ -178,32 +180,37 @@ public class FessXpathTransformerTest extends S2TestCase {
}
public void test_convertChildUrlList() {
List<String> urlList = new ArrayList<String>();
List<RequestData> urlList = new ArrayList<>();
urlList = fessXpathTransformer.convertChildUrlList(urlList);
assertEquals(0, urlList.size());
urlList.clear();
urlList.add("http://www.example.com");
urlList.add(RequestDataBuilder.newRequestData().get()
.url("http://www.example.com").build());
urlList = fessXpathTransformer.convertChildUrlList(urlList);
assertEquals(1, urlList.size());
assertEquals("http://www.example.com", urlList.get(0));
assertEquals("http://www.example.com", urlList.get(0).getUrl());
urlList.clear();
urlList.add("http://www.example.com");
urlList.add("http://www.test.com");
urlList.add(RequestDataBuilder.newRequestData().get()
.url("http://www.example.com").build());
urlList.add(RequestDataBuilder.newRequestData().get()
.url("http://www.test.com").build());
urlList = fessXpathTransformer.convertChildUrlList(urlList);
assertEquals(2, urlList.size());
assertEquals("http://www.example.com", urlList.get(0));
assertEquals("http://www.test.com", urlList.get(1));
assertEquals("http://www.example.com", urlList.get(0).getUrl());
assertEquals("http://www.test.com", urlList.get(1).getUrl());
urlList.clear();
urlList.add("feed://www.example.com");
urlList.add("http://www.test.com");
urlList.add(RequestDataBuilder.newRequestData().get()
.url("feed://www.example.com").build());
urlList.add(RequestDataBuilder.newRequestData().get()
.url("http://www.test.com").build());
urlList = fessXpathTransformer.convertChildUrlList(urlList);
assertEquals(2, urlList.size());
assertEquals("http://www.example.com", urlList.get(0));
assertEquals("http://www.test.com", urlList.get(1));
assertEquals("http://www.example.com", urlList.get(0).getUrl());
assertEquals("http://www.test.com", urlList.get(1).getUrl());
}
@ -277,10 +284,10 @@ public class FessXpathTransformerTest extends S2TestCase {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<String> childUrlList = e.getChildUrlList();
final Set<RequestData> childUrlList = e.getChildUrlList();
assertEquals(1, childUrlList.size());
assertEquals("http://example.com/hoge", childUrlList.iterator()
.next());
.next().getUrl());
}
data = "<html><link rel=\"canonical\" href=\"http://example.com/hoge\"><body>aaa</body></html>";
@ -289,10 +296,10 @@ public class FessXpathTransformerTest extends S2TestCase {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<String> childUrlList = e.getChildUrlList();
final Set<RequestData> childUrlList = e.getChildUrlList();
assertEquals(1, childUrlList.size());
assertEquals("http://example.com/hoge", childUrlList.iterator()
.next());
.next().getUrl());
}
}