init:yioop-4.0.1
This commit is contained in:
commit
3b97f4ec02
519 changed files with 186218 additions and 0 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
local_config.php
|
||||
LocalConfig.php
|
||||
.DS_Store
|
||||
/vendor/
|
||||
/work_directory/
|
12
.htaccess
Normal file
12
.htaccess
Normal file
|
@ -0,0 +1,12 @@
|
|||
Options +FollowSymLinks
|
||||
RewriteEngine On
|
||||
|
||||
RewriteRule "^wd/(css|scripts|locale)/(.*)$" work_directory/app/$1/$2 [L]
|
||||
RewriteRule "^wd/resources/(.*)/(.*)/(.*)/(.*)/(.*)$" index.php?c=resource&a=get&f=resources&$1&g=$2&p=$3&sf=$4&n=$5 [L]
|
||||
RewriteRule "^wd/resources/(.*)/(.*)/(.*)/(.*)$" index.php?c=resource&a=get&f=resources&$1&g=$2&p=$3&n=$4 [L]
|
||||
|
||||
RewriteRule "^(css|resources|scripts|locale)/(.*)$" src/$1/$2 [L]
|
||||
RewriteRule "^((.*)bar.xml|favicon.ico|robots.txt)$" src/$1 [L]
|
||||
|
||||
RewriteCond %{REQUEST_FILENAME} !index.php|tests/.*$|(src|work_directory/app)/(favicon.ico$|robots.txt$|yioopbar.xml$|(css|scripts|resources/.*$|locale/.*$))
|
||||
RewriteRule ^ index.php [L]
|
51
INSTALL
Normal file
51
INSTALL
Normal file
|
@ -0,0 +1,51 @@
|
|||
SeekQuarry/Yioop --
|
||||
Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
|
||||
Copyright (C) 2009 - 2015 Chris Pollett chris@pollett.org
|
||||
|
||||
http://www.seekquarry.com/
|
||||
|
||||
LICENSE:
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
What follows is a brief summary of how to install Yioop!
|
||||
More details about installation and configuration (including screenshots)
|
||||
can be found at:
|
||||
http://www.seekquarry.com/?c=main&p=documentation
|
||||
|
||||
Installation
|
||||
-------------
|
||||
(1) Move the Yioop search engine into some folder under your
|
||||
web server's document root.
|
||||
|
||||
(2)管理账号:root 密码为空 (Admin account: root; the password is empty.)
|
||||
|
||||
(3)
|
||||
|
||||
(4) To start a crawl, you need to point your browser at the url
|
||||
of your Yioop installation. Click on the admin link,
|
||||
then the manage crawl link. Type in a description of your
|
||||
crawl and click "Start New Crawl". After about a minute you should
|
||||
start to see the Currently Processing and Most Recent Urls sections updating
|
||||
with info about the current crawl.
|
||||
|
||||
(5) After running your crawl for a while you can click the Stop
|
||||
button to stop it. The crawl should show up after a delay in the
|
||||
Previous Crawls table. There you can choose to resume a crawl,
|
||||
delete it, or set it as the current index.
|
||||
|
||||
(6) If you set a crawl as the current index, when you go
|
||||
back to your installation's web page and type a query in the search bar,
|
||||
the query will be answered with the results from that crawl.
|
683
LICENSE
Normal file
683
LICENSE
Normal file
|
@ -0,0 +1,683 @@
|
|||
SeekQuarry/Yioop --
|
||||
Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
|
||||
Copyright (C) 2009 - 2015 Chris Pollett chris@pollett.org
|
||||
|
||||
SeekQuarry is distributed under the terms of the GNU GENERAL PUBLIC
|
||||
LICENSE reproduced below.
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
||||
|
72
README
Normal file
72
README
Normal file
|
@ -0,0 +1,72 @@
|
|||
SeekQuarry/Yioop --
|
||||
Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
|
||||
Copyright (C) 2009 - 2015 Chris Pollett chris@pollett.org
|
||||
|
||||
http://www.seekquarry.com/
|
||||
|
||||
LICENSE:
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
Summary
|
||||
-------
|
||||
The Yioop search engine consists of three main
|
||||
scripts:
|
||||
|
||||
src/executables/Fetcher.php - used to download batches of urls provided
|
||||
by the queue_server.
|
||||
src/executables/queue_server.php - maintains a queue of urls that are
|
||||
going to be scheduled to be seen. It also keeps
|
||||
track of what has been seen and robots.txt info.
|
||||
Its last responsibility is to create the index_archive
|
||||
that is used by the search front end.
|
||||
|
||||
index.php -- a search engine web page. It is also used
|
||||
to handle message passing between the fetchers
|
||||
(multiple machines can act as fetchers) and the
|
||||
queue_server.
|
||||
|
||||
Download
|
||||
--------
|
||||
You can download the SeekQuarry search engine from
|
||||
http://www.seekquarry.com/
|
||||
|
||||
Requirements
|
||||
------------
|
||||
The Yioop search engine requires Apache and
|
||||
PHP. It was developed under Apache 2.2, PHP 5.4,
|
||||
and the sqlite3 built into PHP.
|
||||
|
||||
Credits
|
||||
------
|
||||
The source code is mainly due to Chris Pollett.
|
||||
Other contributors include: Mangesh Dahale, Ravi Dhillon, Priya Gangaraju,
|
||||
Akshat Kukreti, Pooja Mishra, Sreenidhi Pundi Muralidharan,
|
||||
Nakul Natu, Shailesh Padave, Vijaya Pamidi, Snigdha Parvatneni,
|
||||
Akash Patel, Vijeth Patil, Mallika Perepa, Tarun Pepira,
|
||||
Eswara Rajesh Pinapala, Tamayee Potluri, Shawn Tice, Pushkar Umaranikar,
|
||||
Sandhya Vissapragada. Several people helped with localization:
|
||||
My wife, Mary Pollett, Jonathan Ben-David, Ismail.B, Andrea Brunetti,
|
||||
Thanh Bui, Sujata Dongre, Animesh Dutta, Aida Khosroshahi, Radha Kotipalli,
|
||||
Youn Kim, Akshat Kukreti, Chao-Hsin Shih, Ahmed Kamel Taha, and Sugi Widjaja.
|
||||
|
||||
Installation
|
||||
-------------
|
||||
Please see the INSTALL file
|
||||
|
||||
Documentation and Support
|
||||
-------------------------
|
||||
Please check out seekquarry.com
|
34
composer.json
Normal file
34
composer.json
Normal file
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"name": "seekquarry/yioop",
|
||||
"description": "Search Engine Portal with Front-end, Indexer, Crawler, Wiki, and NLP libraries",
|
||||
"homepage": "https://www.seekquarry.com/",
|
||||
"license": "GPLv3",
|
||||
"authors": [
|
||||
{
|
||||
"name": "Chris Pollett",
|
||||
"email": "chris@pollett.org"
|
||||
}
|
||||
],
|
||||
"minimum-stability": "stable",
|
||||
"require": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-dom": "*",
|
||||
"ext-gd": "*",
|
||||
"ext-json": "*",
|
||||
"ext-mbstring": "*",
|
||||
"ext-pcre": "*",
|
||||
"ext-PDO":"*",
|
||||
"ext-pdo_sqlite": "*",
|
||||
"ext-SPL":"*",
|
||||
"ext-zip": "*",
|
||||
"lib-curl": "*",
|
||||
"lib-libxml": "*",
|
||||
"lib-pcre": "*"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"seekquarry\\yioop\\": ["work_directory/app/", "src/"],
|
||||
"seekquarry\\yioop\\tests\\": ["tests/"]
|
||||
}
|
||||
}
|
||||
}
|
32
composer.lock
generated
Normal file
32
composer.lock
generated
Normal file
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"_readme": [
|
||||
"This file locks the dependencies of your project to a known state",
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"hash": "a9a653eb7781a772e1ab492dd4eee5e8",
|
||||
"content-hash": "9a00ca22dcc4a39f7ecfbdf2444131f2",
|
||||
"packages": [],
|
||||
"packages-dev": [],
|
||||
"aliases": [],
|
||||
"minimum-stability": "stable",
|
||||
"stability-flags": [],
|
||||
"prefer-stable": false,
|
||||
"prefer-lowest": false,
|
||||
"platform": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-dom": "*",
|
||||
"ext-gd": "*",
|
||||
"ext-json": "*",
|
||||
"ext-mbstring": "*",
|
||||
"ext-pcre": "*",
|
||||
"ext-pdo": "*",
|
||||
"ext-pdo_sqlite": "*",
|
||||
"ext-spl": "*",
|
||||
"ext-zip": "*",
|
||||
"lib-curl": "*",
|
||||
"lib-libxml": "*",
|
||||
"lib-pcre": "*"
|
||||
},
|
||||
"platform-dev": []
|
||||
}
|
48
index.php
Normal file
48
index.php
Normal file
|
@ -0,0 +1,48 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
/**
 * Sends a request from the outer Yioop index.php file on to the inner one.
 * Marks that redirects are on.
 *
 * Before the request is passed on, runs of slashes and "./" segments in
 * the path portion of $_SERVER['REQUEST_URI'] are collapsed to a single
 * "/". If that changes the path, a 301 redirect to the cleaned-up uri is
 * sent and the script exits. The query string (everything from the first
 * "?" on) is copied through unchanged so that parameter values containing
 * "//" are not rewritten by the redirect.
 */
function passthruYioopRequest()
{
    $uri = $_SERVER['REQUEST_URI'];
    $query_pos = strpos($uri, "?");
    if ($query_pos === false) {
        $path = $uri;
        $query = "";
    } else {
        $path = substr($uri, 0, $query_pos);
        $query = substr($uri, $query_pos);
    }
    $new_path = preg_replace("@/(\.?/)+@", "/", $path);
    // preg_replace gives null on a pcre error; in that case do not redirect
    if ($new_path !== null && $new_path !== $path) {
        header("Location: " . $new_path . $query, true, 301);
        exit();
    }
    define("seekquarry\\yioop\\configs\\REDIRECTS_ON", true);
    require_once __DIR__ . "/src/index.php";
}
|
||||
passthruYioopRequest();
|
44
src/advertise.php
Normal file
44
src/advertise.php
Normal file
|
@ -0,0 +1,44 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Web page used to display the static "advertise" page of
|
||||
* the SeekQuarry/Yioop Search engine
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
/* set-up static advertise page for display */
$_REQUEST['c'] = "static";
$_REQUEST['p'] = "advertise";
/*
   The sibling entry scripts blog.php and bot.php define this constant
   before requiring index.php and then call bootstrap() themselves; do the
   same here so all of these scripts behave alike.
   NOTE(review): presumably the constant stops index.php from calling
   bootstrap() on its own -- confirm against src/index.php
 */
define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
/**
 * load in main entry point
 */
require_once(__DIR__."/index.php");
bootstrap();
exit();
|
42
src/blog.php
Normal file
42
src/blog.php
Normal file
|
@ -0,0 +1,42 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Web page used to display a simple blog about current crawls related to
|
||||
* the SeekQuarry/Yioop Search engine
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
/*
   Flag that this script calls bootstrap() itself, then preset the request
   fields c, a and just_group_id before the main entry point is loaded.
   NOTE(review): c and a presumably select the controller and activity --
   confirm against src/index.php
 */
define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
$_REQUEST['just_group_id'] = 2;
$_REQUEST['a'] = "groupFeeds";
$_REQUEST['c'] = "group";
/**
 * load in main entry point
 */
require_once __DIR__ . "/index.php";
bootstrap();
exit();
|
41
src/bot.php
Normal file
41
src/bot.php
Normal file
|
@ -0,0 +1,41 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Web page used to display information about the web robot used in
|
||||
* the SeekQuarry/Yioop Search engine
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
/*
   Flag that this script calls bootstrap() itself, then preset the request
   fields c and p so that the static "bot" page is what gets displayed.
 */
define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
$_REQUEST['p'] = "bot";
$_REQUEST['c'] = "static";
/**
 * load in main entry point
 */
require_once __DIR__ . "/index.php";
bootstrap();
exit();
|
944
src/configs/Config.php
Normal file
944
src/configs/Config.php
Normal file
|
@ -0,0 +1,944 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Used to set the configuration settings of the Yioop/SeekQuarry project.
|
||||
*
|
||||
* Some settings can be set in the Page Options and Server Settings
|
||||
* and Appearance activities. Other settings can be overridden by making
|
||||
* a LocalConfig.php file in the same folder as this file and using the
|
||||
* same namespace. If a setting in this file is created using nsdefine
|
||||
* it is unlikely that it is safe to override. If it is created using
|
||||
* nsconddefine it should be fair game for tweaking in the LocalConfig.php
|
||||
* file
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
/**
 * So can autoload classes. We try to use the autoloader that
 * Composer would define but if that fails we use a default autoloader.
 *
 * The default autoloader maps classes in the seekquarry\yioop\tests
 * namespace to files under PARENT_DIR/tests and all other classes in the
 * seekquarry\yioop namespace to files under WORK_DIRECTORY/app or, failing
 * that, BASE_DIR. Classes outside these namespaces are left to any other
 * registered autoloader.
 */
if (file_exists(__DIR__ . "/../../vendor/autoload.php")) {
    require_once __DIR__ . "/../../vendor/autoload.php";
} else {
    spl_autoload_register(function ($class) {
        /* Prefixes end with a namespace separator so that only whole
           namespace segments match (seekquarry\yioop\testsupport\Foo is
           not a tests class) and so that the relative class name does not
           begin with a separator
         */
        $tests_prefix = 'seekquarry\\yioop\\tests\\';
        $yioop_prefix = 'seekquarry\\yioop\\';
        if (strncmp($tests_prefix, $class, strlen($tests_prefix)) === 0) {
            $len = strlen($tests_prefix);
            $check_dirs = [PARENT_DIR . "/tests"];
        } else if (strncmp($yioop_prefix, $class,
            strlen($yioop_prefix)) === 0) {
            $len = strlen($yioop_prefix);
            // local (work directory) versions of a class take precedence
            $check_dirs = [WORK_DIRECTORY . "/app", BASE_DIR];
        } else {
            // not one of our classes, move to the next registered autoloader
            return;
        }
        // get the relative class name
        $relative_class = substr($class, $len);
        // use forward-slashes, add .php
        $unixify_class_name = "/" . str_replace('\\', '/', $relative_class) .
            '.php';
        foreach ($check_dirs as $dir) {
            $file = $dir . $unixify_class_name;
            if (file_exists($file)) {
                require $file;
                break;
            }
        }
    });
}
|
||||
/**
 * Creates a constant whose fully qualified name lies in the Yioop configs
 * namespace (seekquarry\yioop\configs)
 * @param string $constant unqualified name of the constant to create
 * @param $value the value the new constant should hold
 */
function nsdefine($constant, $value)
{
    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
    define($qualified_name, $value);
}
|
||||
/**
 * Reports whether a constant of the given unqualified name already exists
 * in the Yioop configs namespace (seekquarry\yioop\configs)
 * @param string $constant unqualified name of the constant to look for
 * @return bool true if the constant exists, false otherwise
 */
function nsdefined($constant)
{
    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
    return defined($qualified_name);
}
|
||||
/**
 * Creates a constant in the Yioop configs namespace
 * (seekquarry\yioop\configs) unless a constant of that name already
 * exists there, in which case the existing value is kept.
 * @param string $constant unqualified name of the constant to create
 * @param $value the value to use if the constant does not exist yet
 */
function nsconddefine($constant, $value)
{
    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
    if (defined($qualified_name)) {
        return;
    }
    define($qualified_name, $value);
}
|
||||
/**
|
||||
* Version number for upgrade function
|
||||
* @var int
|
||||
*/
|
||||
nsdefine('YIOOP_VERSION', 49);
|
||||
/**
|
||||
* Minimum Version of Yioop for which keyword ad script
|
||||
* still works with this version
|
||||
* @var int
|
||||
*/
|
||||
nsdefine('MIN_AD_VERSION', 36);
|
||||
/**
 * nsdefine's the BASE_URL constant for this script
 *
 * BASE_URL is assembled from the request scheme, the server name
 * ("localhost" if $_SERVER['SERVER_NAME'] is not set), the port if it is
 * not the default one for the scheme, and the directory portion of
 * $_SERVER['SCRIPT_NAME']. It always ends with a "/".
 */
function computeBaseUrl()
{
    $pathinfo = pathinfo($_SERVER['SCRIPT_NAME']);
    // a port forwarded by a proxy takes precedence over the local one
    $server_port = isset($_SERVER['HTTP_X_FORWARDED_PORT']) ?
        $_SERVER['HTTP_X_FORWARDED_PORT'] : (isset($_SERVER['SERVER_PORT']) ?
        $_SERVER['SERVER_PORT'] : 80);
    /* Some servers (for example IIS using ISAPI) set $_SERVER['HTTPS'] to
       "off" for plain http requests, so a non-empty value on its own does
       not mean the request came in over https
     */
    $https_on = !empty($_SERVER['HTTPS']) &&
        strtolower($_SERVER['HTTPS']) != "off";
    $http = ($https_on || $server_port == 443) ? "https://" : "http://";
    // only mention the port in the url if it is not the scheme's default
    $default_port = ($http == "https://") ? 443 : 80;
    $port = ($server_port != $default_port) ? ":" . $server_port : "";
    $server_name = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] :
        "localhost";
    /* pathinfo can use backslashes as directory separators on Windows;
       urls should only contain forward slashes
     */
    $dir_name = isset($pathinfo["dirname"]) ?
        str_replace("\\", "/", $pathinfo["dirname"]) : "";
    if ($dir_name == ".") {
        $dir_name = "";
    }
    $extra_slash = ($dir_name == '/') ? "" : '/';
    //used in register controller to create links back to server
    nsdefine("BASE_URL", $http . $server_name . $port . $dir_name .
        $extra_slash);
}
|
||||
/*
|
||||
pcre is an external library to php which can cause Yioop
|
||||
to seg fault if given instances of reg expressions with
|
||||
large recursion depth on a string.
|
||||
https://bugs.php.net/bug.php?id=47376
|
||||
The goal here is to cut off these problems before they happen.
|
||||
We do this in config.php because it is included in most Yioop
|
||||
files.
|
||||
*/
|
||||
ini_set('pcre.recursion_limit', 3000);
|
||||
ini_set('pcre.backtrack_limit', 1000000);
|
||||
/** Calculate base directory of script
|
||||
* @ignore
|
||||
*/
|
||||
nsconddefine("BASE_DIR", str_replace("\\", "/", realpath(__DIR__ ."/../")));
|
||||
nsconddefine("PARENT_DIR", substr(BASE_DIR, 0, -strlen("/src")));
|
||||
computeBaseUrl();
|
||||
/** Yioop Namespace*/
|
||||
nsdefine('NS', "seekquarry\\yioop\\");
|
||||
/** configs sub-namespace */
|
||||
nsdefine('NS_CONFIGS', NS . "configs\\");
|
||||
/** controllers sub-namespace */
|
||||
nsdefine('NS_CONTROLLERS', NS . "controllers\\");
|
||||
/** components sub-namespace */
|
||||
nsdefine('NS_COMPONENTS', NS_CONTROLLERS . "components\\");
|
||||
/** executables sub-namespace */
|
||||
nsdefine('NS_EXEC', NS . "executables\\");
|
||||
/** library sub-namespace */
|
||||
nsdefine('NS_LIB', NS . "library\\");
|
||||
/** jobs sub-namespace */
|
||||
nsdefine('NS_JOBS', NS_LIB . "media_jobs\\");
|
||||
/** Models sub-namespace */
|
||||
nsdefine('NS_MODELS', NS . "models\\");
|
||||
/** datasources sub-namespace */
|
||||
nsdefine('NS_DATASOURCES', NS_MODELS . "datasources\\");
|
||||
/** archive_bundle_iterators sub-namespace */
|
||||
nsdefine('NS_ARCHIVE', NS_LIB . "archive_bundle_iterators\\");
|
||||
/** indexing_plugins sub-namespace */
|
||||
nsdefine('NS_PLUGINS', NS_LIB . "indexing_plugins\\");
|
||||
/** processors sub-namespace */
|
||||
nsdefine('NS_PROCESSORS', NS_LIB . "processors\\");
|
||||
/** text summarizer sub-namespace */
|
||||
nsdefine('NS_SUMMARIZERS', NS_LIB . "summarizers\\");
|
||||
/** locale sub-namespace */
|
||||
nsdefine('NS_LOCALE', NS . "locale\\");
|
||||
/** views sub-namespace */
|
||||
nsdefine('NS_VIEWS', NS . "views\\");
|
||||
/** elements sub-namespace */
|
||||
nsdefine('NS_ELEMENTS', NS_VIEWS . "elements\\");
|
||||
/** helpers sub-namespace */
|
||||
nsdefine('NS_HELPERS', NS_VIEWS . "helpers\\");
|
||||
/** layouts sub-namespace */
|
||||
nsdefine('NS_LAYOUTS', NS_VIEWS . "layouts\\");
|
||||
/** tests sub-namespace */
|
||||
nsdefine('NS_TESTS', NS . "tests\\");
|
||||
/** Don't display any query info*/
|
||||
nsdefine('NO_DEBUG_INFO', 0);
|
||||
/** bit of DEBUG_LEVEL used to indicate test cases should be displayable*/
|
||||
nsdefine('TEST_INFO', 1);
|
||||
/** bit of DEBUG_LEVEL used to indicate query statistics should be displayed*/
|
||||
nsdefine('QUERY_INFO', 2);
|
||||
/** bit of DEBUG_LEVEL used to indicate php messages should be displayed*/
|
||||
nsdefine('ERROR_INFO', 4);
|
||||
/** Maintenance mode restricts access to local machine*/
|
||||
nsdefine("MAINTENANCE_MODE", false);
|
||||
/** Constant used to indicate lasting an arbitrary number of seconds */
|
||||
nsdefine('FOREVER', -2);
|
||||
/** Number of seconds in a day*/
|
||||
nsdefine('ONE_DAY', 86400);
|
||||
/** Number of seconds in a week*/
|
||||
nsdefine('ONE_WEEK', 604800);
|
||||
/** Number of seconds in a 30 day month */
|
||||
nsdefine('ONE_MONTH', 2592000);
|
||||
/** Number of seconds in a 365 day year */
|
||||
nsdefine('ONE_YEAR', 31536000);
|
||||
/** Number of seconds in an hour */
|
||||
nsdefine('ONE_HOUR', 3600);
|
||||
/** Number of seconds in a minute */
|
||||
nsdefine('ONE_MINUTE', 60);
|
||||
/** Number of seconds in a second */
|
||||
nsdefine('ONE_SECOND', 1);
|
||||
if (file_exists(BASE_DIR."/configs/LocalConfig.php")) {
|
||||
/** Include any locally specified defines (could use as an alternative
|
||||
way to set work directory) */
|
||||
require_once(BASE_DIR."/configs/LocalConfig.php");
|
||||
}
|
||||
/** setting Profile.php to something else in LocalConfig.php allows one to have
|
||||
* two different yioop instances share the same work_directory but maybe have
|
||||
* different configuration settings. This might be useful if one was
|
||||
* production and one was more dev.
|
||||
*/
|
||||
nsconddefine('PROFILE_FILE_NAME', "/Profile.php");
|
||||
nsconddefine('MAINTENANCE_MESSAGE', <<<EOD
|
||||
This Yioop! installation is undergoing maintenance, please come back later!
|
||||
EOD
|
||||
);
|
||||
if (MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) {
|
||||
echo MAINTENANCE_MESSAGE;
|
||||
exit();
|
||||
}
|
||||
|
||||
/** Work directory to use when WORK_DIRECTORY has not already been defined */
|
||||
nsdefine('DEFAULT_WORK_DIRECTORY', PARENT_DIR . "/work_directory");
|
||||
|
||||
if (!nsdefined('WORK_DIRECTORY')) {
|
||||
/*+++ The next block of code is machine edited, change at
|
||||
your own risk, please use configure web page instead +++*/
|
||||
nsdefine('WORK_DIRECTORY', DEFAULT_WORK_DIRECTORY);
|
||||
/*++++++*/
|
||||
// end machine edited code
|
||||
}
|
||||
/** Directory for local versions of web app classes*/
|
||||
nsdefine('APP_DIR', WORK_DIRECTORY."/app");
|
||||
/** Directory to place files such as dictionaries that will be
|
||||
converted to Bloom filter using token_tool.php. Similarly,
|
||||
can be used to hold files which will be used to prepare
|
||||
a file to assist in crawling or serving search results
|
||||
*/
|
||||
nsdefine('PREP_DIR', WORK_DIRECTORY."/prepare");
|
||||
/** Locale dir to use in case LOCALE_DIR does not exist yet or is
|
||||
* missing some file
|
||||
*/
|
||||
nsdefine('FALLBACK_LOCALE_DIR', BASE_DIR."/locale");
|
||||
/** Captcha mode indicating to use a text captcha*/
|
||||
nsdefine('TEXT_CAPTCHA', 1);
|
||||
/** Captcha mode indicating to use a hash cash computation for a captcha*/
|
||||
nsdefine('HASH_CAPTCHA', 2);
|
||||
/** Captcha mode indicating to use a classic image based captcha*/
|
||||
nsdefine('IMAGE_CAPTCHA', 3);
|
||||
/** Authentication Mode Possibility*/
|
||||
nsdefine('NORMAL_AUTHENTICATION', 1);
|
||||
/** Authentication Mode Possibility*/
|
||||
nsdefine('ZKP_AUTHENTICATION', 2);
|
||||
/** Recovery Mode Possibility (presumably: no account recovery offered) */
|
||||
nsdefine('NO_RECOVERY', 0);
|
||||
/** Recovery Mode Possibility (presumably: recovery by email) */
|
||||
nsdefine('EMAIL_RECOVERY', 1);
|
||||
/** Recovery Mode Possibility (presumably: recovery by email and questions) */
|
||||
nsdefine('EMAIL_AND_QUESTIONS_RECOVERY', 2);
|
||||
/** If ZKP Authentication via Fiat Shamir Protocol used how many iterations
|
||||
* to do
|
||||
*/
|
||||
nsconddefine('FIAT_SHAMIR_ITERATIONS', 20);
|
||||
if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
|
||||
// Upgrade of a profile written before constants were namespaced. It runs
// whenever a locale/en-US folder exists without a matching en_US folder,
// so it can run more than once and therefore has to be idempotent.
if ((file_exists(WORK_DIRECTORY . "/locale/en-US") &&
    !file_exists(WORK_DIRECTORY . "/locale/en_US"))
    || (file_exists(WORK_DIRECTORY . "/app/locale/en-US") &&
    !file_exists(WORK_DIRECTORY . "/app/locale/en_US"))) {
    $old_profile = file_get_contents(WORK_DIRECTORY . PROFILE_FILE_NAME);
    // if the profile could not be read leave it alone, otherwise an
    // empty string would be written over it
    if ($old_profile !== false) {
        $new_profile = $old_profile;
        // add the namespace line once, after the first opening tag only
        if (strpos($new_profile,
            "namespace seekquarry\\yioop\\configs;") === false) {
            $new_profile = preg_replace('/\<\?php/', "<?php\n".
                "namespace seekquarry\\yioop\\configs;\n",
                $new_profile, 1);
        }
        // \b stops already converted nsdefine( and nsdefined( calls from
        // becoming nsnsdefine( and nsnsdefined(
        $new_profile = preg_replace('/\b(define(?:d?))\(/', 'ns$1(',
            $new_profile);
        // preg_replace returns null on error; only write real changes
        if (is_string($new_profile) && $new_profile !== $old_profile) {
            file_put_contents(WORK_DIRECTORY . PROFILE_FILE_NAME,
                $new_profile);
        }
    }
}
|
||||
require_once WORK_DIRECTORY . PROFILE_FILE_NAME;
|
||||
nsdefine('PROFILE', true);
|
||||
nsdefine('CRAWL_DIR', WORK_DIRECTORY);
|
||||
if (is_dir(APP_DIR."/locale")) {
|
||||
nsdefine('LOCALE_DIR', WORK_DIRECTORY."/app/locale");
|
||||
} else if (is_dir(WORK_DIRECTORY."/locale")) {
|
||||
//old work directory location
|
||||
nsdefine('LOCALE_DIR', WORK_DIRECTORY."/locale");
|
||||
} else {
|
||||
/** @ignore */
|
||||
nsdefine('LOCALE_DIR', FALLBACK_LOCALE_DIR);
|
||||
}
|
||||
nsdefine('LOG_DIR', WORK_DIRECTORY."/log");
|
||||
if (nsdefined('DB_URL') && !nsdefined('DB_HOST')) {
|
||||
nsdefine('DB_HOST', DB_URL); //for backward compatibility
|
||||
}
|
||||
if (nsdefined('QUEUE_SERVER') && !nsdefined('NAME_SERVER')) {
|
||||
nsdefine('NAME_SERVER', QUEUE_SERVER); //for backward compatibility
|
||||
}
|
||||
if (NAME_SERVER == 'http://' || NAME_SERVER == 'https://') {
|
||||
nsdefine("FIX_NAME_SERVER", true);
|
||||
}
|
||||
} else {
|
||||
if ((!isset( $_SERVER['SERVER_NAME']) ||
|
||||
$_SERVER['SERVER_NAME']!=='localhost')
|
||||
&& !nsdefined("NO_LOCAL_CHECK") && !nsdefined("WORK_DIRECTORY")
|
||||
&& php_sapi_name() != 'cli' ) {
|
||||
echo "SERVICE AVAILABLE ONLY VIA LOCALHOST UNTIL CONFIGURED";
|
||||
exit();
|
||||
}
|
||||
/** @ignore */
|
||||
nsdefine('PROFILE', false);
|
||||
nsdefine('DBMS', 'Sqlite3');
|
||||
nsdefine('AUTHENTICATION_MODE', NORMAL_AUTHENTICATION);
|
||||
nsdefine('RECOVERY_MODE', EMAIL_RECOVERY);
|
||||
nsdefine('DEBUG_LEVEL', NO_DEBUG_INFO);
|
||||
nsdefine('USE_FILECACHE', false);
|
||||
nsdefine('WEB_ACCESS', true);
|
||||
nsdefine('RSS_ACCESS', true);
|
||||
nsdefine('API_ACCESS', true);
|
||||
nsdefine('REGISTRATION_TYPE', 'disable_registration');
|
||||
nsdefine('USE_MAIL_PHP', true);
|
||||
nsdefine('MAIL_SERVER', '');
|
||||
nsdefine('MAIL_PORT', '');
|
||||
nsdefine('MAIL_USERNAME', '');
|
||||
nsdefine('MAIL_PASSWORD', '');
|
||||
nsdefine('MAIL_SECURITY', '');
|
||||
nsdefine('MEDIA_MODE', 'name_server');
|
||||
nsdefine('DB_NAME', "default");
|
||||
nsdefine('DB_USER', '');
|
||||
nsdefine('DB_PASSWORD', '');
|
||||
nsdefine('DB_HOST', '');
|
||||
/** @ignore */
|
||||
nsdefine('CRAWL_DIR', BASE_DIR);
|
||||
/** @ignore */
|
||||
nsdefine('LOCALE_DIR', FALLBACK_LOCALE_DIR);
|
||||
/** @ignore */
|
||||
nsdefine('LOG_DIR', BASE_DIR."/log");
|
||||
nsdefine('NAME_SERVER', "http://localhost/");
|
||||
nsdefine('USER_AGENT_SHORT', "NeedsNameBot");
|
||||
nsdefine('DEFAULT_LOCALE', "en-US");
|
||||
nsdefine('AUTH_KEY', 0);
|
||||
nsdefine('USE_MEMCACHE', false);
|
||||
nsdefine('USE_PROXY', false);
|
||||
nsdefine('TOR_PROXY', '127.0.0.1:9150');
|
||||
nsdefine('PROXY_SERVERS', null);
|
||||
nsdefine('WORD_SUGGEST', true);
|
||||
nsdefine('CACHE_LINK', true);
|
||||
nsdefine('SIMILAR_LINK', true);
|
||||
nsdefine('IN_LINK', true);
|
||||
nsdefine('IP_LINK', true);
|
||||
nsdefine('RESULT_SCORE', true);
|
||||
nsdefine('SIGNIN_LINK', true);
|
||||
nsdefine('NEWS_MODE', 'news_off');
|
||||
/** BM25F weight for title text */
|
||||
nsdefine ('TITLE_WEIGHT', 4);
|
||||
/** BM25F weight for other text within doc*/
|
||||
nsdefine ('DESCRIPTION_WEIGHT', 1);
|
||||
/** BM25F weight for other text within links to a doc*/
|
||||
nsdefine ('LINK_WEIGHT', 2);
|
||||
/** If that many exist, the minimum number of results to get
|
||||
and group before trying to compute the top x (say 10) results
|
||||
*/
|
||||
nsdefine ('MIN_RESULTS_TO_GROUP', 200);
|
||||
/** For a given number of search results total to return (total_num)
|
||||
server_alpha*total_num/num_servers will be returned by any given
|
||||
queue server machine*/
|
||||
nsdefine ('SERVER_ALPHA', 1.6);
|
||||
nsdefine('BACKGROUND_COLOR', "#FFFFFF");
|
||||
nsdefine('FOREGROUND_COLOR', "#FFFFFF");
|
||||
nsdefine('SIDEBAR_COLOR', "#88AA44");
|
||||
nsdefine('TOPBAR_COLOR', "#EEEEFF");
|
||||
nsdefine('AD_LOCATION','none');
|
||||
}
|
||||
/** URL that all url paths will be constructed from */
|
||||
nsconddefine('BASE_URL', NAME_SERVER);
|
||||
/** Relative url to website logo */
|
||||
nsconddefine('LOGO', "resources/yioop.png");
|
||||
/** Relative url to mobile website logo */
|
||||
nsconddefine('M_LOGO', "resources/m-yioop.png");
|
||||
/** Url for website favicon */
|
||||
nsconddefine('FAVICON', BASE_URL . "favicon.ico");
|
||||
/** Timezone for website */
|
||||
nsconddefine('TIMEZONE', 'America/Los_Angeles');
|
||||
/* name of the cookie used to manage the session
|
||||
(store language and perpage settings), define CSRF token
|
||||
*/
|
||||
nsconddefine('SESSION_NAME', "yioopbiscuit");
|
||||
nsconddefine('CSRF_TOKEN', "YIOOP_TOKEN");
|
||||
/** locations that ads can be placed in search result pages */
|
||||
nsconddefine('AD_LOCATION', "none");
|
||||
date_default_timezone_set(TIMEZONE);
|
||||
// Report all PHP errors when the ERROR_INFO bit of DEBUG_LEVEL is set,
// otherwise report none
error_reporting(((DEBUG_LEVEL & ERROR_INFO) == ERROR_INFO) ? -1 : 0);
|
||||
/** if true tests are displayable*/
|
||||
nsdefine('DISPLAY_TESTS', ((DEBUG_LEVEL & TEST_INFO) == TEST_INFO));
|
||||
/** if true query statistics are displayed */
|
||||
nsconddefine('QUERY_STATISTICS', ((DEBUG_LEVEL & QUERY_INFO) == QUERY_INFO));
|
||||
// Decide whether mobile css and formatting should be used: the user agent
// has to mention "mobile" or "fennec" and must not mention "ipad".
// A request that sends no user agent is treated as not mobile.
if (isset($_SERVER['HTTP_USER_AGENT'])) {
    $agent = $_SERVER['HTTP_USER_AGENT'];
    nsdefine("MOBILE", (stristr($agent, "mobile") ||
        stristr($agent, "fennec")) && !stristr($agent, "ipad"));
} else {
    nsdefine("MOBILE", false);
}
|
||||
/*
|
||||
* Various groups and user ids. These must be nsdefined before the
|
||||
* profile check and return below
|
||||
*/
|
||||
/** ID of the root user */
|
||||
nsdefine('ROOT_ID', 1);
|
||||
/**User name of the root user. If you want to change this, change
|
||||
the value in LocalConfig.php, then run the Createdb.php script. You
|
||||
should do this before you have much data in your system. */
|
||||
nsconddefine('ROOT_USERNAME', "root");
|
||||
/** Role of the root user */
|
||||
nsdefine('ADMIN_ROLE', 1);
|
||||
/** Default role of an active user */
|
||||
nsdefine('USER_ROLE', 2);
|
||||
/** Default role of an advertiser */
|
||||
nsdefine('BUSINESS_ROLE', 3);
|
||||
/** Default role of a bot user */
|
||||
nsdefine('BOT_ROLE', 4);
|
||||
/** ID of the group to which all Yioop users belong */
|
||||
nsdefine('PUBLIC_GROUP_ID', 2);
|
||||
/** ID of the public user (NOTE(review): presumably the account used for visitors who are not signed in -- confirm) */
|
||||
nsdefine('PUBLIC_USER_ID', 2);
|
||||
/** ID of the group to which all Yioop Help Wiki articles belong */
|
||||
nsdefine('HELP_GROUP_ID', 3);
|
||||
/** Length of advertisement name string */
|
||||
nsdefine('ADVERTISEMENT_NAME_LEN', 25);
|
||||
/** Length of advertisement text description */
|
||||
nsdefine('ADVERTISEMENT_TEXT_LEN', 35);
|
||||
/** Length of advertisement keywords */
|
||||
nsdefine('ADVERTISEMENT_KEYWORD_LEN', 60);
|
||||
/** Length of advertisement date */
|
||||
nsdefine('ADVERTISEMENT_DATE_LEN', 20);
|
||||
/** Length of advertisement destination */
|
||||
nsdefine('ADVERTISEMENT_DESTINATION_LEN', 60);
|
||||
/** value used to create advertisement*/
|
||||
nsdefine('ADVERTISEMENT_ACTIVE_STATUS', 1);
|
||||
/** value used to stop advertisement campaign */
|
||||
nsdefine('ADVERTISEMENT_DEACTIVATED_STATUS',2);
|
||||
/** value used to admin suspend advertisement campaign */
|
||||
nsdefine('ADVERTISEMENT_SUSPENDED_STATUS',3);
|
||||
/** value used to indicate campaign completed successfully */
|
||||
nsdefine('ADVERTISEMENT_COMPLETED_STATUS',4);
|
||||
if (!PROFILE) {
|
||||
return;
|
||||
}
|
||||
/*+++ End machine generated code, feel free to edit the below as desired +++*/
|
||||
/** this is the User-Agent name the crawler provides to
|
||||
* a web-server it is crawling
|
||||
*/
|
||||
nsconddefine('USER_AGENT',
|
||||
'Mozilla/5.0 (compatible; '.USER_AGENT_SHORT.'; +'.NAME_SERVER.'bot.php)');
|
||||
/**
|
||||
* To change the Open Search Tool bar name override the following constant
|
||||
* in your LocalConfig.php file
|
||||
*/
|
||||
nsconddefine('SEARCHBAR_PATH', NAME_SERVER . "yioopbar.xml");
|
||||
/**
|
||||
* Phantom JS is used by some optional Javascript tests of the Yioop interface.
|
||||
* The constant PHANTOM_JS should point to the path to phantomjs
|
||||
*/
|
||||
nsconddefine("PHANTOM_JS", "phantomjs");
|
||||
/** maximum size of a log file before it is rotated */
|
||||
nsconddefine("MAX_LOG_FILE_SIZE", 5000000);
|
||||
/** number of log files to rotate amongst */
|
||||
nsconddefine("NUMBER_OF_LOG_FILES", 5);
|
||||
/**
|
||||
* how long in seconds to keep a cache of a robot.txt
|
||||
* file before re-requesting it
|
||||
*/
|
||||
nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY);
|
||||
/**
|
||||
* Whether the scheduler should track ETag and Expires headers.
|
||||
* If you want to turn this off set the variable to false in
|
||||
* LocalConfig.php
|
||||
*/
|
||||
nsconddefine('USE_ETAG_EXPIRES', true);
|
||||
/**
|
||||
* if the robots.txt has a Crawl-delay larger than this
|
||||
* value don't crawl the site.
|
||||
* maximum value for this is 255
|
||||
*/
|
||||
nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
|
||||
/** maximum number of active crawl-delayed hosts */
|
||||
nsconddefine('MAX_WAITING_HOSTS', 250);
|
||||
/** Minimum weight in priority queue before rebuilt */
|
||||
nsconddefine('MIN_QUEUE_WEIGHT', 1/100000);
|
||||
/** largest sized object allowed in a web archive (used to sanity check
|
||||
* reading data out of a web archive)
|
||||
*/
|
||||
nsconddefine('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
|
||||
/** Treat earlier timestamps as being indexes of format version 0 */
|
||||
nsconddefine('VERSION_0_TIMESTAMP', 1369754208);
|
||||
defineMemoryProfile();
|
||||
/**
 * Estimates how much memory the current machine has and uses the estimate
 * to define the constants MEMORY_LOW, MEMORY_STANDARD and MEMORY_PROFILE.
 * FETCHER_PROCESS_DELAY is also passed to nsconddefine here.
 *
 * On Windows the estimate is the sum of the lines output by
 * "wmic memorychip get capacity" (only attempted if exec exists). On Linux
 * it is the MemTotal entry of /proc/meminfo. On any other system 4GB is
 * assumed. If the Windows or Linux lookup produces nothing the estimate is
 * 0, so the low memory profile is chosen.
 */
function defineMemoryProfile()
{
    //assume have at least 4GB on a Mac(could use vm_stat)
    $memory = 4000000000;
    if (strstr(PHP_OS, "WIN")) {
        if (function_exists("exec")) {
            // exec appends to the array, so start from an empty one
            $memory_array = [];
            exec('wmic memorychip get capacity', $memory_array);
            $memory = array_sum($memory_array);
        }
    } else if (stristr(PHP_OS, "LINUX")) {
        $memory = 0;
        $meminfo = is_readable("/proc/meminfo") ?
            file_get_contents("/proc/meminfo") : false;
        // look MemTotal up by name rather than by position and only use
        // it if the file could actually be read
        if ($meminfo !== false &&
            preg_match('/^MemTotal:\s+(\d+)/m', $meminfo, $matches)) {
            // the value in /proc/meminfo is in kB
            $memory = 1024 * intval($matches[1]);
        }
    }
    /**
     * Factor to multiply sizes of Yioop data structures with in low ram
     * memory setting (2GB)
     */
    nsdefine('MEMORY_LOW', 1);
    /**
     * Factor to multiply sizes of Yioop data structures with if have more
     * than (2GB)
     */
    nsdefine('MEMORY_STANDARD', 4);
    if ($memory < 2200000000) {
        /**
         * Based on system memory, either the low or high memory factor
         */
        nsdefine('MEMORY_PROFILE', MEMORY_LOW);
    } else {
        /**
         * @ignore
         */
        nsdefine('MEMORY_PROFILE', MEMORY_STANDARD);
    }
    /**
     * Delay in microseconds between processing pages to try to avoid
     * CPU overheating. On some systems, you can set this to 0.
     */
    nsconddefine('FETCHER_PROCESS_DELAY', 10000);
}
|
||||
|
||||
/**
|
||||
* bloom filters are used to keep track of which urls are visited,
|
||||
* this parameter determines up to how many
|
||||
* urls will be stored in a single filter. Additional filters are
|
||||
* read to and from disk.
|
||||
*/
|
||||
nsconddefine('URL_FILTER_SIZE', MEMORY_PROFILE * 5000000);
|
||||
/**
|
||||
* maximum number of urls that will be held in ram
|
||||
* (as opposed to in files) in the priority queue
|
||||
*/
|
||||
nsconddefine('NUM_URLS_QUEUE_RAM', MEMORY_PROFILE * 80000);
|
||||
/** number of documents before next gen */
|
||||
nsconddefine('NUM_DOCS_PER_GENERATION', MEMORY_PROFILE * 10000);
|
||||
/** precision to round floating points document scores */
|
||||
nsconddefine('PRECISION', 10);
|
||||
/** maximum number of links to extract from a page on an initial pass*/
|
||||
nsconddefine('MAX_LINKS_TO_EXTRACT', MEMORY_PROFILE * 80);
|
||||
/** maximum number of links to keep after initial extraction*/
|
||||
nsconddefine('MAX_LINKS_PER_PAGE', 50);
|
||||
/** Estimate of the average number of links per page a document has*/
|
||||
nsconddefine('AVG_LINKS_PER_PAGE', 24);
|
||||
/** maximum number of links to consider from a sitemap page */
|
||||
nsconddefine('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80);
|
||||
/** maximum number of words from links to consider on any given page */
|
||||
nsconddefine('MAX_LINKS_WORD_TEXT', 100);
|
||||
/** maximum length of urls to try to queue, this is important for
|
||||
* memory when creating schedule, since the amount of memory is
|
||||
* going to be greater than the product MAX_URL_LEN*MAX_FETCH_SIZE
|
||||
* text_processors need to promise to implement this check or rely
|
||||
* on the base class which does implement it in extractHttpHttpsUrls
|
||||
*/
|
||||
nsconddefine('MAX_URL_LEN', 512);
|
||||
/** request this many bytes out of a page -- this is the default value to
|
||||
* use if the user doesn't set this value in the page options GUI
|
||||
*/
|
||||
nsdefine('PAGE_RANGE_REQUEST', 50000);
|
||||
/**
|
||||
* When getting information from an index dictionary in word iterator
|
||||
* how many distinct generations to read in in one go
|
||||
*/
|
||||
nsconddefine('NUM_DISTINCT_GENERATIONS', 20);
|
||||
/**
|
||||
* Max number of chars to extract for description from a page to index.
|
||||
* Only words in the description are indexed. -- this is the default value
|
||||
* can be set in Page Options
|
||||
*/
|
||||
nsdefine('MAX_DESCRIPTION_LEN', 2000);
|
||||
/**
|
||||
* Allow pages to be recrawled after this many days -- this is the
|
||||
* default value to use if the user doesn't set this value in the page options
|
||||
* GUI. What this controls is how often the page url filter is deleted.
|
||||
* A nonpositive value means the filter will never be deleted.
|
||||
*/
|
||||
nsdefine('PAGE_RECRAWL_FREQUENCY', -1);
|
||||
/** number of multi curl page requests in one go */
|
||||
nsconddefine('NUM_MULTI_CURL_PAGES', 100);
|
||||
/** number of pages to extract from an archive in one go */
|
||||
nsconddefine('ARCHIVE_BATCH_SIZE', 100);
|
||||
/** time in seconds before we give up on multi page requests*/
|
||||
nsconddefine('PAGE_TIMEOUT', 30);
|
||||
/** time in seconds before we give up on a single page request*/
|
||||
nsconddefine('SINGLE_PAGE_TIMEOUT', ONE_MINUTE);
|
||||
/** max time in seconds in a process before write a log message if
|
||||
* crawlTimeoutLog is called repeatedly from a loop
|
||||
*/
|
||||
nsconddefine('LOG_TIMEOUT', 30);
|
||||
/** Number of lines of QueueServer log file to check to make sure both
|
||||
* Indexer and Scheduler are running. 6000 should be roughly 20-30 minutes
|
||||
*/
|
||||
nsconddefine('LOG_LINES_TO_RESTART', 6000);
|
||||
/** File name of file used to record last log lines when a Yioop process has
|
||||
* crashed.
|
||||
*/
|
||||
nsconddefine('CRASH_LOG_NAME', LOG_DIR . "/YioopCrashes.log");
|
||||
/**
|
||||
* Maximum time a crawl daemon process can go before calling
|
||||
* @see CrawlDaemon::processHandler
|
||||
*/
|
||||
nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE);
|
||||
/**
|
||||
* Number of error page 400 or greater seen from a host before crawl-delay
|
||||
* host and dump remainder from current schedule
|
||||
*/
|
||||
nsconddefine('DOWNLOAD_ERROR_THRESHOLD', 50);
|
||||
/** Crawl-delay to set in the event that DOWNLOAD_ERROR_THRESHOLD exceeded*/
|
||||
nsconddefine('ERROR_CRAWL_DELAY', 20);
|
||||
/**
|
||||
* if FFMPEG defined, the maximum size of a uploaded video file which will
|
||||
* be automatically transcoded by Yioop to mp4 and webm
|
||||
*/
|
||||
nsconddefine("MAX_VIDEO_CONVERT_SIZE", 2000000000);
|
||||
/**
|
||||
* The maximum time limit in seconds where if a file is not converted by the
|
||||
* time it will be picked up again by the client media updater
|
||||
* This value largely depends on the no of client media updaters that we have
|
||||
* and also the maximum video size that would be uploaded to yioop.
|
||||
* This value should be kept more than the sleeping time of media updater
|
||||
* loop to avoid conversion of same file multiple times.
|
||||
*/
|
||||
nsconddefine('MAX_FILE_TIMESTAMP_LIMIT', 600);
|
||||
/**
|
||||
* This mail timestamp limit allows mail server to create a new file
|
||||
* and write next mailer batch in the new file. Otherwise, new mailer
|
||||
* batch will be written in old file. For eg. new file will be created every
|
||||
* 5 minutes as per below value.
|
||||
*/
|
||||
nsconddefine('MAX_MAIL_TIMESTAMP_LIMIT', 300);
|
||||
/**
|
||||
* Default edge size of square image thumbnails in pixels
|
||||
*/
|
||||
nsconddefine('THUMB_DIM', 128);
|
||||
/**
|
||||
* Maximum size of a user thumb file that can be uploaded
|
||||
*/
|
||||
nsconddefine('THUMB_SIZE', 1000000);
|
||||
/** Characters we view as not part of words, not same as POSIX [:punct:]*/
|
||||
nsconddefine('PUNCT', "\.|\,|\:|\;|\"|\'|\[|\/|\%|\?|-|" .
|
||||
"\]|\{|\}|\(|\)|\!|\||\&|\`|" .
|
||||
"\’|\‘|©|®|™|℠|…|\/|\>|,|\=|。|)|:|、|" .
|
||||
"”|“|《|》|(|「|」|★|【|】|·|\+|\*|;".
|
||||
"|!|—|―|?|!|،|؛|؞|؟|٪|٬|٭");
|
||||
/** Number of total description deemed title */
|
||||
nsconddefine('AD_HOC_TITLE_LENGTH', 50);
|
||||
/** Used to say number of bytes in histogram bar (stats page) for file
|
||||
download sizes
|
||||
*/
|
||||
nsconddefine('DOWNLOAD_SIZE_INTERVAL', 5000);
|
||||
/** Used to say number of secs in histogram bar for file download times*/
|
||||
nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
|
||||
/**
|
||||
* How many non robot urls the fetcher successfully downloads before
|
||||
* sending its data back to the queue server
|
||||
*/
|
||||
nsconddefine('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 95);
|
||||
/** maximum number of urls to schedule to a given fetcher in one go */
|
||||
nsconddefine('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000);
|
||||
/** fetcher must wait at least this long between multi-curl requests */
|
||||
nsconddefine('MINIMUM_FETCH_LOOP_TIME', 5);
|
||||
/** an idling fetcher sleeps this long between queue_server pings*/
|
||||
nsconddefine('FETCH_SLEEP_TIME', 10);
|
||||
/** a queue_server's minimum loop idle time*/
|
||||
nsconddefine('QUEUE_SLEEP_TIME', 5);
|
||||
/** How often mirror script tries to synchronize with machine it is mirroring*/
|
||||
nsconddefine('MIRROR_SYNC_FREQUENCY', ONE_HOUR);
|
||||
/** How often mirror script tries to notify machine it is mirroring that it
|
||||
is still alive*/
|
||||
nsconddefine('MIRROR_NOTIFY_FREQUENCY', ONE_MINUTE);
|
||||
/** Max time before dirty index (queue_server) and
|
||||
filters (fetcher) will be force saved in seconds*/
|
||||
nsconddefine('FORCE_SAVE_TIME', ONE_HOUR);
|
||||
/** Number of seconds of no fetcher contact before crawl is deemed dead */
|
||||
nsdefine("CRAWL_TIME_OUT", 1800);
|
||||
/** maximum number of terms allowed in a conjunctive search query */
|
||||
nsconddefine('MAX_QUERY_TERMS', 10);
|
||||
/** maximum length of a search query */
|
||||
nsconddefine('MAX_QUERY_LEN', 4096);
|
||||
/** whether to use question answering system */
|
||||
nsconddefine('ENABLE_QUESTION_ANSWERING', true);
|
||||
/** Number of words until to switch from bag of words to phrase lookup */
|
||||
nsconddefine('PHRASE_THRESHOLD', 3);
|
||||
/** default number of search results to display per page */
|
||||
nsconddefine('NUM_RESULTS_PER_PAGE', 10);
|
||||
/** Number of recently crawled urls to display on admin screen */
|
||||
nsconddefine('NUM_RECENT_URLS_TO_DISPLAY', 10);
|
||||
/** Maximum time a set of results can stay in query cache before it is
|
||||
invalidated. If negative, then never use time to kick something out of
|
||||
cache. */
|
||||
nsconddefine('MAX_QUERY_CACHE_TIME', 2 * ONE_DAY); //two days
|
||||
/** Minimum time a set of results can stay in query cache before it is
|
||||
invalidated (used for active crawl or feed results) */
|
||||
nsconddefine('MIN_QUERY_CACHE_TIME', ONE_HOUR); //one hour
|
||||
/**
|
||||
* Default number of items to page through for users,roles, mixes, etc
|
||||
* on the admin screens
|
||||
*/
|
||||
nsconddefine('DEFAULT_ADMIN_PAGING_NUM', 50);
|
||||
/** Maximum number of bytes that the file that the suggest-a-url form
|
||||
* send data to can be.
|
||||
*/
|
||||
nsconddefine('MAX_SUGGEST_URL_FILE_SIZE', 100000);
|
||||
/** Maximum number of urls a user can suggest to the suggest-a-url form in one day
|
||||
*/
|
||||
nsconddefine('MAX_SUGGEST_URLS_ONE_DAY', 10);
|
||||
/** Directly add suggested urls to crawl options and inject them into any
|
||||
* active crawl. If false, these are stored in a file and the user has to
|
||||
* click a button to add them.
|
||||
*/
|
||||
nsconddefine('DIRECT_ADD_SUGGEST', false);
|
||||
/**
|
||||
* Length after which to truncate names for users/groups/roles when
|
||||
* they are displayed (not in DB)
|
||||
*/
|
||||
nsconddefine('NAME_TRUNCATE_LEN', 7);
|
||||
/** USER STATUS value used for someone who is not in a group but can browse*/
|
||||
nsdefine('NOT_MEMBER_STATUS', -1);
|
||||
/** USER STATUS value used for a user who can log in and perform activities */
|
||||
nsdefine('ACTIVE_STATUS', 1);
|
||||
/**
|
||||
* USER STATUS value used for a user whose account is created, but which
|
||||
* still needs to undergo admin or email verification/activation
|
||||
*/
|
||||
nsdefine('INACTIVE_STATUS', 2);
|
||||
/**
|
||||
* USER STATUS used to indicate an account which can no longer perform
|
||||
* activities but which might be retained to preserve old blog posts.
|
||||
*/
|
||||
nsdefine('SUSPENDED_STATUS', 3);
|
||||
/** Group status used to indicate a user that has been invited to join
|
||||
* a group but who has not yet accepted
|
||||
*/
|
||||
nsdefine('INVITED_STATUS', 4);
|
||||
/**
|
||||
* Group registration type that only allows people to join a group by
|
||||
* invitation
|
||||
*/
|
||||
nsdefine('NO_JOIN', 1);
|
||||
/**
|
||||
* Group registration type that only allows people to request a membership
|
||||
* in a group from the group's owner
|
||||
*/
|
||||
nsdefine('REQUEST_JOIN', 2);
|
||||
/**
|
||||
* Group registration type that only allows people to request a membership
|
||||
* in a group from the group's owner, but allows people to browse the groups
|
||||
* content without join
|
||||
*/
|
||||
nsdefine('PUBLIC_BROWSE_REQUEST_JOIN', 3);
|
||||
/**
|
||||
* Group registration type that allows anyone to obtain membership
|
||||
* in the group
|
||||
*/
|
||||
nsdefine('PUBLIC_JOIN', 4);
|
||||
/**
|
||||
* Group access code signifying only the group owner can
|
||||
* read items posted to the group or post new items
|
||||
*/
|
||||
nsdefine('GROUP_PRIVATE', 1);
|
||||
/**
|
||||
* Group access code signifying members of the group can
|
||||
* read items posted to the group but only the owner can post
|
||||
* new items
|
||||
*/
|
||||
nsdefine('GROUP_READ', 2);
|
||||
/**
|
||||
* Group access code signifying members of the group can
|
||||
* read and comment on items posted to the group but only the owner can post
|
||||
* new items
|
||||
*/
|
||||
nsdefine('GROUP_READ_COMMENT', 3);
|
||||
/**
|
||||
* Group access code signifying members of the group can both
|
||||
* read items posted to the group as well as post new items
|
||||
*/
|
||||
nsdefine('GROUP_READ_WRITE', 4);
|
||||
/**
|
||||
* Group access code signifying members of the group can both
|
||||
* read items posted to the group as well as post new items
|
||||
* and can edit the group's wiki
|
||||
*/
|
||||
nsdefine('GROUP_READ_WIKI', 5);
|
||||
/**
|
||||
* Indicates a group where people can't up and down vote threads
|
||||
*/
|
||||
nsdefine("NON_VOTING_GROUP", 0);
|
||||
/**
|
||||
* Indicates a group where people can vote up threads (but not down)
|
||||
*/
|
||||
nsdefine("UP_VOTING_GROUP", 1);
|
||||
/**
|
||||
* Indicates a group where people can vote up and down threads
|
||||
*/
|
||||
nsdefine("UP_DOWN_VOTING_GROUP", 2);
|
||||
/**
|
||||
* Typical posts to a group feed are on user created threads and
|
||||
* so are of this type
|
||||
*/
|
||||
nsdefine('STANDARD_GROUP_ITEM', 0);
|
||||
/**
|
||||
* Indicates the thread was created to go alongside the creation of a wiki
|
||||
* page so that people can discuss the pages contents
|
||||
*/
|
||||
nsdefine('WIKI_GROUP_ITEM', 1);
|
||||
/**
|
||||
* Used to record that a page belongs to the template category
|
||||
*/
|
||||
nsdefine('WIKI_STANDARD_LINK', -1);
|
||||
/**
|
||||
* Used to record that a page belongs to the template category
|
||||
*/
|
||||
nsdefine('WIKI_TEMPLATE_LINK', -2);
|
||||
/**
|
||||
* set to true if Multiple news updaters are running
|
||||
* otherwise set to false if name server is running the news updater
|
||||
*/
|
||||
nsconddefine('SEND_MAIL_MEDIA_UPDATER', false);
|
||||
/**
|
||||
* Impression type used to record one view of a thread
|
||||
*/
|
||||
nsdefine('THREAD_IMPRESSION', 1);
|
||||
/**
|
||||
* Impression type used to record one view of a wiki page
|
||||
*/
|
||||
nsdefine('WIKI_IMPRESSION', 2);
|
||||
/**
|
||||
* Impression type used to record one thread or wiki page view in a group
|
||||
*/
|
||||
nsdefine('GROUP_IMPRESSION', 3);
|
||||
/**
|
||||
* Impression type used to record one search query view
|
||||
*/
|
||||
nsdefine('QUERY_IMPRESSION', 4);
|
||||
/**
|
||||
* Used to control update frequency of impression analytic data when
|
||||
* media updater in use
|
||||
*/
|
||||
nsconddefine("ANALYTICS_UPDATE_INTERVAL", ONE_HOUR / 6);
|
||||
/** Value of epsilon in differential privacy formula */
|
||||
nsconddefine('PRIVACY_EPSILON', 0.01);
|
||||
/** Flag to turn on/off search impression recording */
|
||||
nsconddefine('SEARCH_ANALYTICS', true);
|
||||
/** Flag to turn on/off group impression recording */
|
||||
nsconddefine('GROUP_ANALYTICS', true);
|
||||
/** Flag to turn on/off differential privacy */
|
||||
nsconddefine('DIFFERENTIAL_PRIVACY', false);
|
||||
/*
|
||||
* Database Field Sizes
|
||||
*/
|
||||
/* Length for names of things like first name, last name, etc */
|
||||
nsdefine('NAME_LEN', 32);
|
||||
/** Used for lengths of media sources, passwords, and emails */
|
||||
nsdefine('LONG_NAME_LEN', 64);
|
||||
/** Length for names of things like group names, etc */
|
||||
nsdefine('SHORT_TITLE_LEN', 128);
|
||||
/** Length for names of things like titles of blog entries, etc */
|
||||
nsdefine('TITLE_LEN', 512);
|
||||
/** Length of a feed item or post, etc */
|
||||
nsdefine('MAX_GROUP_POST_LEN', 8192);
|
||||
/** Length for the contents of a wiki_page */
|
||||
nsdefine('MAX_GROUP_PAGE_LEN', 524288);
|
||||
/** Length for base 64 encode timestamps */
|
||||
nsdefine('TIMESTAMP_LEN', 11);
|
||||
/** Length for timestamps down to microseconds */
|
||||
nsdefine('MICROSECOND_TIMESTAMP_LEN', 20);
|
||||
/** Length for a CAPTCHA */
|
||||
nsdefine('CAPTCHA_LEN', 6);
|
||||
/** Maximum length of an IP address written as a string (39 fits a full IPv6 address) */
|
||||
nsdefine('MAX_IP_ADDRESS_AS_STRING_LEN', 39);
|
||||
/** Length for a number field */
|
||||
nsdefine('NUM_FIELD_LEN', 4);
|
||||
/** Length for writing mode in locales */
|
||||
nsdefine('WRITING_MODE_LEN', 5);
|
||||
/** Length of zero knowledge password string */
|
||||
nsdefine('ZKP_PASSWORD_LEN', 200);
|
||||
/*
|
||||
* Adjustable AD RELATED defines
|
||||
*/
|
||||
/** Truncate length for ad description and keywords*/
|
||||
nsdefine('ADVERTISEMENT_TRUNCATE_LEN', 8);
|
||||
/** Initial bid amount for advertisement keyword */
|
||||
nsconddefine('AD_KEYWORD_INIT_BID',1);
|
||||
/** Allows the root account to purchase free ad credits. Might
|
||||
* mess up the value of credits if allow. This only makes a difference
|
||||
* in the presence of an ad processing script
|
||||
*/
|
||||
nsconddefine('ALLOW_FREE_ROOT_CREDIT_PURCHASE', false);
|
||||
/** advertisement date format for start date and end date*/
|
||||
nsconddefine('AD_DATE_FORMAT','Y-m-d');
|
||||
/** advertisement logo*/
|
||||
nsconddefine('AD_LOGO','resources/adv-logo.png');
|
||||
/** sentence compression enabled or not*/
|
||||
nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
|
583
src/configs/ConfigureTool.php
Normal file
583
src/configs/ConfigureTool.php
Normal file
|
@ -0,0 +1,583 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Used to create and manipulate a profile and work directory from the
|
||||
* command-line for Yioop.
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\controllers\AdminController;
|
||||
|
||||
// This tool may only be run from the command line; refuse any other SAPI
if (php_sapi_name() != 'cli') {
    echo "BAD REQUEST";
    exit();
}
|
||||
/** Loads common utility functions*/
|
||||
require_once __DIR__."/../library/Utility.php";
|
||||
/** Loads locale-related functions*/
|
||||
require_once __DIR__."/../library/LocaleFunctions.php";
|
||||
mb_internal_encoding("UTF-8");
|
||||
mb_regex_encoding("UTF-8");
|
||||
/**
 * Writes a string to the current output; a short alias for echo
 *
 * @param string $text string to send to the current output
 */
function e($text)
{
    print $text;
}
|
||||
$locale_tag = L\guessLocale();
|
||||
$locale = null;
|
||||
L\setLocaleObject($locale_tag);
|
||||
/**
|
||||
* Provides a command-line interface way to configure a Yioop Instance.
|
||||
* Unlike the web interface this interface is English-only.
|
||||
*/
|
||||
class ConfigureTool
|
||||
{
|
||||
/**
|
||||
* Used to hold an AdminController object used to manipulate the
|
||||
* Yioop configuration
|
||||
* @var object
|
||||
*/
|
||||
public $admin;
|
||||
/**
|
||||
* Holds the main menu data for the configuration tool
|
||||
* @var array
|
||||
*/
|
||||
public $menu = ["workDirectory" => "Create/Set Work Directory",
|
||||
"rootPassword" => "Change root password",
|
||||
"defaultLocale"=> "Set Default Locale",
|
||||
"debugDisplay"=> "Debug Display Set-up",
|
||||
"searchAccess"=> "Search Access Set-up",
|
||||
"searchPageElementLinks" => "Search Page Elements and Links",
|
||||
"nameServer" => "Name Server Set-up",
|
||||
"robotSetUp"=> "Crawl Robot Set-up",
|
||||
"quit" => "Exit program"
|
||||
];
|
||||
/**
 * Builds the tool with a clean, simulated request environment.
 *
 * Configuration changes are carried out by AdminController methods, which
 * expect their input in the super globals that an HTTP request would
 * normally populate. As this tool runs from the command line it fills
 * those super globals in by hand; here each of them is reset to an empty
 * array before the controller is created.
 */
function __construct()
{
    $_SERVER = [];
    $_SESSION = [];
    $_GET = [];
    $_POST = [];
    $_REQUEST = [];
    $this->admin = new AdminController();
}
|
||||
/**
 * Main loop of the tool: runs the method named by the current state and
 * uses that method's return value as the next state, until the state
 * "quit" is reached.
 *
 * Only the keys of $this->menu together with "configureMenu" are runnable
 * states. If a method hands back anything else, the loop goes back to the
 * main menu rather than spinning forever on a state it can never run.
 */
function loop()
{
    $activities = array_keys($this->menu);
    $activities[] = "configureMenu";
    $state = "configureMenu";
    while ($state !== "quit") {
        // strict check so that only real activity names are ever dispatched
        if (!in_array($state, $activities, true)) {
            $state = "configureMenu";
        }
        $state = $this->$state();
    }
}
|
||||
/**
 * Shows the main menu: prints the banner and the output of the system
 * check, then asks the user which configuration activity to run next.
 *
 * When the configure data reports no profile, only the work directory and
 * exit options are offered; otherwise the full menu is listed.
 *
 * @return string value handed back by drawChooseItems() for the menu
 */
function configureMenu()
{
    $this->banner();
    $data = $this->callConfigure();
    $rule = "\n===============================\n";
    e("Checking Yioop configuration..." . $rule);
    e(str_replace("<br />", "\n", $data["SYSTEM_CHECK"]) . $rule);
    $items = ($data["PROFILE"]) ? $this->menu :
        ["workDirectory" => "Create/Set Work Directory",
        "quit" => "Exit program"];
    return $this->drawChooseItems($items, "configureMenu");
}
|
||||
/**
 * Prompts for a new location of this Yioop instance's work directory and
 * then asks the user to confirm, re-enter, or abandon the change.
 *
 * @return string name of the next menu to show
 */
function workDirectory()
{
    $this->banner();
    $data = $this->callConfigure();
    $directory = "No value set yet.";
    if (isset($data["WORK_DIRECTORY"]) && $data["WORK_DIRECTORY"] != "") {
        $directory = $data["WORK_DIRECTORY"];
    }
    e("CURRENT WORK DIRECTORY: $directory\n\n" . "Enter a new value:\n");
    // with REQUEST_URI set, prepareGlobals() will not prompt for a web path
    if (!isset($_SERVER['REQUEST_URI'])) {
        $_SERVER['REQUEST_URI'] = "";
    }
    $this->prepareGlobals($data);
    $_REQUEST["WORK_DIRECTORY"] = L\readInput();
    $_REQUEST["arg"] = "directory";
    return $this->confirmChange("configure", "workDirectory");
}
|
||||
/**
 * Prompts for the old and new password of the root account of this Yioop
 * instance and then asks the user to confirm the change.
 *
 * @return string name of the next menu to show
 */
function rootPassword()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    // request field => prompt, asked in this order
    $prompts = ["password" => "Enter old password:",
        "new_password" => "Enter new password:",
        "retype_password" => "Re-Enter new password:"];
    foreach ($prompts as $field => $prompt) {
        e($prompt);
        $_REQUEST[$field] = L\readPassword();
    }
    $_SESSION['USER_ID'] = ROOT_ID;
    $_REQUEST['arg'] = "updateuser";
    $_REQUEST['edit_pass'] = "true";
    return $this->confirmChange("manageAccount", "rootPassword");
}
|
||||
/**
 * Changes the default locale (language) used by Yioop when it cannot
 * determine that information from the user's browser.
 *
 * The selected locale is placed in $_REQUEST together with the current
 * profile fields, and "defaultLocale" is returned so this screen is shown
 * again.
 *
 * @return string "defaultLocale" after a locale was chosen, otherwise
 *      "configureMenu"
 */
function defaultLocale()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    $current_language = $data["LANGUAGES"][$data["DEFAULT_LOCALE"]];
    e("CURRENT LANGUAGE: " . $current_language . "\n\n");
    $_SESSION = [];
    $items = $data["LANGUAGES"];
    $items["configureMenu"] = "Return to Main Menu";

    // keep asking while drawChooseItems() reports an invalid selection
    $choice = "defaultLocale";
    while ($choice == "defaultLocale") {
        $choice = $this->drawChooseItems($items, "defaultLocale");
    }

    $this->prepareGlobals($data);
    if ($choice != "configureMenu") {
        $_REQUEST["DEFAULT_LOCALE"] = $choice;
        return "defaultLocale";
    }
    $_REQUEST = [];
    $_SERVER = [];
    return "configureMenu";
}
|
||||
/**
 * Used to configure debugging information for this Yioop instance,
 * i.e., whether PHP notices, warnings, errors should be displayed,
 * whether query statistics and info should be displayed, and whether
 * unit tests should be viewable from the web.
 *
 * The flag chosen by the user is toggled in the current debug level and
 * every flag that is on afterwards is written to $_REQUEST under its own
 * name.
 *
 * @return string "debugDisplay" after a toggle, otherwise "configureMenu"
 */
function debugDisplay()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    /* Map each flag name to its value using ordinary constant expressions.
       These are resolved in this file's namespace first and globally
       second, unlike constant($name), which only accepts fully qualified
       names and fails on a name that is not a defined constant.
     */
    $flags = ["ERROR_INFO" => ERROR_INFO, "QUERY_INFO" => QUERY_INFO,
        "TEST_INFO" => TEST_INFO];
    $labels = ["ERROR_INFO" => "Error Info", "QUERY_INFO" => "Query Info",
        "TEST_INFO" => "Test Info"];
    e("CURRENT DEBUG SETTINGS\n======================\n");
    $dlevel = $data["DEBUG_LEVEL"];
    $items = [];
    foreach ($flags as $name => $flag) {
        $setting = ($dlevel & $flag) ? "On" : "Off";
        e($labels[$name] . ": [$setting]\n");
        $items[$name] = "Toggle " . $labels[$name];
    }
    $items["configureMenu"] = "Return to Main Menu";
    do {
        $choice = $this->drawChooseItems($items, "debugDisplay");
    } while ($choice == "debugDisplay");
    $this->prepareGlobals($data);
    if ($choice == "configureMenu") {
        $_REQUEST = [];
        $_SERVER = [];
        return "configureMenu";
    }
    // a choice that is not a flag name toggles nothing
    $flag = isset($flags[$choice]) ? $flags[$choice] : 0;
    $dlevel = ($dlevel & $flag) ? $dlevel - $flag : $dlevel + $flag;
    foreach ($flags as $name => $bit) {
        if ($dlevel & $bit) {
            $_REQUEST[$name] = true;
        }
    }
    return "debugDisplay";
}
|
||||
/**
 * Configures which methods are allowed by this Yioop instance to access
 * search results (via the web, via open rss search results, via the API).
 *
 * The inverted value of the chosen setting is placed in $_REQUEST together
 * with the current profile fields.
 *
 * @return string "searchAccess" after a toggle, otherwise "configureMenu"
 */
function searchAccess()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    e("CURRENT SEARCH ACCESS SETTINGS\n==============================\n");
    $labels = ["WEB_ACCESS" => "Web", "RSS_ACCESS" => "RSS",
        "API_ACCESS" => "API"];
    $items = [];
    foreach ($labels as $field => $label) {
        $status = "Off";
        if ($data[$field]) {
            $status = "On";
        }
        e("$label: [$status]\n");
        $items[$field] = "Toggle $label";
    }
    $items["configureMenu"] = "Return to Main Menu";
    // keep asking while drawChooseItems() reports an invalid selection
    $choice = "searchAccess";
    while ($choice == "searchAccess") {
        $choice = $this->drawChooseItems($items, "searchAccess");
    }
    $this->prepareGlobals($data);
    if ($choice == "configureMenu") {
        $_REQUEST = [];
        $_SERVER = [];
        return "configureMenu";
    }
    $_REQUEST[$choice] = !$data[$choice];
    return "searchAccess";
}
|
||||
/**
 * Configures which of the various links of the SERPS page such as
 * Cache, etc should be displayed. Also, configures whether the signin
 * links, etc should be displayed.
 *
 * The inverted value of the chosen setting is placed in $_REQUEST together
 * with the current profile fields.
 *
 * @return string "searchPageElementLinks" after a toggle, otherwise
 *      "configureMenu"
 */
function searchPageElementLinks()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    e("CURRENT SEARCH PAGE ELEMENTS AND LINKS SETTINGS".
        "\n===================================================\n");
    $labels = [
        "WORD_SUGGEST" => "Word Suggest",
        "SUBSEARCH_LINK" => "Subsearch Links",
        "SIGNIN_LINK" => "Sign-in Links",
        "CACHE_LINK" => "Cache Link",
        "SIMILAR_LINK" => "Similar Link",
        "IN_LINK" => "Inlinks",
        "IP_LINK"=> "IP Links"
    ];
    $items = [];
    foreach ($labels as $field => $label) {
        $status = "Off";
        if ($data[$field]) {
            $status = "On";
        }
        e("$label: [$status]\n");
        $items[$field] = "Toggle $label";
    }
    $items["configureMenu"] = "Return to Main Menu";
    // keep asking while drawChooseItems() reports an invalid selection
    $choice = "searchPageElementLinks";
    while ($choice == "searchPageElementLinks") {
        $choice = $this->drawChooseItems($items, "searchPageElementLinks");
    }
    $this->prepareGlobals($data);
    if ($choice == "configureMenu") {
        $_REQUEST = [];
        $_SERVER = [];
        return "configureMenu";
    }
    $_REQUEST[$choice] = !$data[$choice];
    return "searchPageElementLinks";
}
|
||||
/**
 * Configures settings relating to the location of the name server and
 * the server key used with it. Also, configures the caching settings
 * (file cache, memcache, and the list of memcache servers).
 *
 * The new value entered or toggled is placed in $_REQUEST together with
 * the current profile fields.
 *
 * @return string "nameServer" after a change, otherwise "configureMenu"
 */
function nameServer()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    e("NAME SERVER SETTINGS\n====================\n");
    e("Server Key: [" . $data["AUTH_KEY"] . "]\n");
    e("Name Server URL: [" . $data["NAME_SERVER"] . "]\n");
    $items = ["serverKey" => "Edit Server Key",
        "nameServer" => "Edit Name Server Url"];
    $toggles = ["USE_FILECACHE" => "Use File Cache",
        "USE_MEMCACHE" => "Use Memcache"];
    foreach ($toggles as $field => $label) {
        $status = "Off";
        if ($data[$field]) {
            $status = "On";
        }
        e("$label: [$status]\n");
        $items[$field] = "Toggle $label";
    }
    e("\nMemcache Servers:\n=================\n" .
        $data["MEMCACHE_SERVERS"] . "\n=================\n");
    $items["memcacheServers"] = "Edit Memcache Servers";
    $items["configureMenu"] = "Return to Main Menu";
    // keep asking while drawChooseItems() reports an invalid selection
    $choice = "nameServerMenu";
    while ($choice == "nameServerMenu") {
        $choice = $this->drawChooseItems($items, "nameServerMenu");
    }
    $this->prepareGlobals($data);
    if ($choice == "configureMenu") {
        $_REQUEST = [];
        $_SERVER = [];
        return "configureMenu";
    } elseif ($choice == "serverKey") {
        e("Enter a new server key: ");
        $_REQUEST["AUTH_KEY"] = L\readInput();
    } elseif ($choice == "nameServer") {
        e("Enter a new name server url: ");
        $_REQUEST["NAME_SERVER"] = L\readInput();
    } elseif ($choice == "memcacheServers") {
        e("Enter memcache servers, one per line.\n".
            "Terminate input with a line with only '.' on it:\n");
        $_REQUEST["MEMCACHE_SERVERS"] = L\readMessage();
    } else {
        // any remaining choice is treated as an on/off setting to invert
        $_REQUEST[$choice] = !$data[$choice];
    }
    return "nameServer";
}
|
||||
/**
 * Used to set up the name and instance value of this instance of the
 * Yioop robot as well as its description.
 *
 * The new value entered is placed in $_REQUEST together with the current
 * profile fields.
 *
 * @return string "robotSetUp" after an edit, otherwise "configureMenu"
 */
function robotSetUp()
{
    $this->banner();
    $data = $this->callConfigure();
    if ($data["PROFILE"] != 1) {
        $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
        return "configureMenu";
    }
    e("CRAWL ROBOT SETTINGS\n====================\n");
    e("Crawl Robot Name: [" . $data["USER_AGENT_SHORT"] . "]\n");
    e("Robot Instance: [" . $data["ROBOT_INSTANCE"] . "]\n");
    e("\nRobot Description:\n=================\n".
        $data["ROBOT_DESCRIPTION"] . "\n=================\n");
    $items = [
        "robotName" => "Edit Robot Name",
        "robotInstance" => "Edit Robot Instance",
        "robotDescription" => "Edit Robot Description",
        "configureMenu" => "Return to Main Menu"
    ];
    // keep asking while drawChooseItems() reports an invalid selection
    $choice = "robotSetUp";
    while ($choice == "robotSetUp") {
        $choice = $this->drawChooseItems($items, "robotSetUp");
    }
    $this->prepareGlobals($data);
    if ($choice == "configureMenu") {
        $_REQUEST = [];
        $_SERVER = [];
        return "configureMenu";
    } elseif ($choice == "robotName") {
        e("Enter a new robot name: ");
        $_REQUEST["USER_AGENT_SHORT"] = L\readInput();
    } elseif ($choice == "robotInstance") {
        e("Enter a new robot instance value: ");
        $_REQUEST["ROBOT_INSTANCE"] = L\readInput();
    } elseif ($choice == "robotDescription") {
        e("Enter a description of your web crawler robot.\n".
            "Terminate input with a line with only '.' on it:\n");
        $_REQUEST["ROBOT_DESCRIPTION"] = L\readMessage();
    }
    return "robotSetUp";
}
|
||||
/**
 * Asks the user to confirm, re-enter, or abandon the last profile change.
 *
 * On confirm, the AdminController component listing $admin_method among
 * its activities (the "system" component if none does) is asked to run
 * that method, and any message it returns is left in $_REQUEST["MESSAGE"].
 * In every case $_SERVER, $_SESSION and $_REQUEST are emptied afterwards.
 *
 * @param string $admin_method to call if confirmed
 * @param string $reenter_method return value if reenter chosen
 * @return string name of the menu to go to next
 */
function confirmChange($admin_method, $reenter_method)
{
    $items = ["confirm" => "Confirm Change",
        "reenter" => "Re-enter the information",
        "configureMenu" => "Return to the Configure Menu"];
    do {
        $choice = $this->drawChooseItems($items, "confirmChange");
    } while ($choice == "confirmChange");
    $next_menu = "configureMenu";
    $message = "";
    $confirmed = ($choice == "confirm");
    if ($confirmed) {
        $component = "system";
        foreach (AdminController::$component_activities as
            $available_component => $activities) {
            if (in_array($admin_method, $activities)) {
                $component = $available_component;
                break;
            }
        }
        /* run the change before the super globals are emptied below, as
           they carry the values entered by the user
         */
        $data = $this->admin->component($component)->$admin_method();
        if (isset($data["MESSAGE"])) {
            $message = $data["MESSAGE"];
        }
    } elseif ($choice == "reenter") {
        $next_menu = $reenter_method;
    }
    $_SERVER = [];
    $_SESSION = [];
    $_REQUEST = [];
    if ($confirmed) {
        $_REQUEST["MESSAGE"] = $message;
    }
    return $next_menu;
}
|
||||
/**
 * Draws a numbered list of options to the screen and gets a choice
 * from this list from the user.
 *
 * Any pending $_REQUEST["MESSAGE"] is printed once below the list and
 * then cleared. Only input consisting solely of digits that matches one
 * of the printed numbers is accepted; anything else (empty input,
 * "1.5", "1abc", out of range numbers, ...) is an invalid choice.
 *
 * @param array $items as associative array (return value => description)
 * @param string $currentView value to return if invalid choice made
 * @return string a choice from the user
 */
function drawChooseItems($items, $currentView)
{
    $choice_nums = [];
    $i = 1;
    e("\nAvailable Options:\n==================\n");
    foreach ($items as $name => $description) {
        e("($i) $description\n");
        $choice_nums[$i] = $name;
        $i++;
    }
    if (!empty($_REQUEST["MESSAGE"])) {
        e("\n+++ ".$_REQUEST["MESSAGE"]." +++\n");
        unset($_REQUEST["MESSAGE"]);
    }
    e("\nPlease choose an option:\n");
    $user_data = trim(L\readInput());
    // 0 is never a printed number, so it stands for "not a number"
    $selection = ctype_digit($user_data) ? (int)$user_data : 0;
    if (isset($choice_nums[$selection])) {
        $_REQUEST["MESSAGE"] = "";
        return $choice_nums[$selection];
    }
    $_REQUEST["MESSAGE"] = "Invalid choice. Please choose again.";
    return $currentView;
}
|
||||
/**
 * Calls the system component's configure method. The absence of
 * $data['PROFILE'] in the result is taken to mean a redirect happened;
 * in that case $_REQUEST is emptied, configure() is called a second time,
 * and the message from the first call is copied onto the second result.
 *
 * @return array data returned by the configure method
 */
function callConfigure()
{
    $data = $this->admin->component("system")->configure();
    if (isset($data["PROFILE"])) {
        return $data;
    }
    $_REQUEST = [];
    $message = "";
    if (isset($data['MESSAGE'])) {
        $message = $data['MESSAGE'];
    }
    $data = $this->admin->component("system")->configure();
    $data['MESSAGE'] = $message;
    return $data;
}
|
||||
/**
 * Prints the banner used by this configuration tool, preceded by the
 * escape sequences ESC[2J and ESC[;H
 */
function banner()
{
    $escape = "\x1B";
    e($escape . "[2J" . $escape . "[;H" .
        "\n\nYIOOP! CONFIGURATION TOOL\n" .
        "+++++++++++++++++++++++++\n\n");
}
|
||||
/**
 * Sets up the field values of the super globals used by AdminController
 * when changing a profile or managing passwords. These particular
 * values don't change with respect to what this tool does.
 *
 * $_SESSION is emptied and $_REQUEST is rebuilt from the profile fields of
 * $data. If $_SERVER['REQUEST_URI'] is not yet set, it is taken from
 * $data['WEB_URI'] or, when that is empty, read from the user.
 *
 * @param array $data current profile state
 */
function prepareGlobals($data)
{
    $_SESSION = [];
    $_REQUEST = $this->copyProfileFields($data);
    $_REQUEST["arg"] = "profile";
    $_REQUEST['YIOOP_TOKEN'] = "";
    if (isset($_SERVER['REQUEST_URI'])) {
        return;
    }
    if (empty($data['WEB_URI'])) {
        e("Enter web path for Yioop instance:\n");
        $_SERVER['REQUEST_URI'] = L\readInput();
    } else {
        $_SERVER['REQUEST_URI'] = $data['WEB_URI'];
    }
}
|
||||
/**
 * Copies those entries of $data which are profile fields, as listed by the
 * profile model's profile_fields, into a new array. Fields that are not
 * set in $data are left out.
 *
 * @param array $data an array of profile and other fields
 * @return array a new array containing a copy of just the profile fields
 *      from the original array
 */
function copyProfileFields($data)
{
    $field_names = $this->admin->model("profile")->profile_fields;
    $profile = [];
    foreach ($field_names as $field) {
        if (!isset($data[$field])) {
            continue;
        }
        $profile[$field] = $data[$field];
    }
    return $profile;
}
|
||||
}
|
||||
$configure_tool = new ConfigureTool();
|
||||
$configure_tool->loop();
|
546
src/configs/Createdb.php
Normal file
546
src/configs/Createdb.php
Normal file
|
@ -0,0 +1,546 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* This script can be used to set up the database and filesystem for the
|
||||
* seekquarry database system. The SeekQuarry system is deployed with a
|
||||
* minimal sqlite database so this script is not strictly needed.
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\models\Model;
|
||||
use seekquarry\yioop\models\ProfileModel;
|
||||
use seekquarry\yioop\models\GroupModel;
|
||||
|
||||
if (!empty($_SERVER['DOCUMENT_ROOT'])) {
|
||||
echo "BAD REQUEST";
|
||||
exit();
|
||||
}
|
||||
/** For crawlHash function */
|
||||
require_once __DIR__."/../library/Utility.php";
|
||||
/** For wiki page translation stuff */
|
||||
require_once __DIR__."/../library/LocaleFunctions.php";
|
||||
/** To make it easy to insert translations */
|
||||
require_once __DIR__."/../library/UpgradeFunctions.php";
|
||||
$profile_model = new ProfileModel(DB_NAME, false);
|
||||
$db_class = NS_DATASOURCES . ucfirst(DBMS)."Manager";
|
||||
$dbinfo = ["DBMS" => DBMS, "DB_HOST" => DB_HOST, "DB_USER" => DB_USER,
|
||||
"DB_PASSWORD" => DB_PASSWORD, "DB_NAME" => DB_NAME];
|
||||
if (!in_array(DBMS, ['sqlite', 'sqlite3'])) {
|
||||
$db = new $db_class();
|
||||
$db->connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME);
|
||||
/* postgres doesn't let you drop a database while connected to it so drop
|
||||
tables instead first
|
||||
*/
|
||||
$profile_model->initializeSql($db, $dbinfo);
|
||||
$database_tables = array_keys($profile_model->create_statements);
|
||||
foreach ($database_tables as $table) {
|
||||
$db->execute("DROP TABLE ".$table);
|
||||
}
|
||||
$db->execute("DROP DATABASE IF EXISTS " . DB_NAME);
|
||||
$db->execute("CREATE DATABASE " . DB_NAME);
|
||||
$db->disconnect();
|
||||
$db->connect(); // default connection goes to actual DB
|
||||
} else {
|
||||
@unlink(CRAWL_DIR."/data/" . DB_NAME . ".db");
|
||||
$db = new $db_class();
|
||||
$db->connect();
|
||||
}
|
||||
if (!$profile_model->createDatabaseTables($db, $dbinfo)) {
|
||||
echo "\n\nCouldn't create database tables!!!\n\n";
|
||||
exit();
|
||||
}
|
||||
$db->execute("INSERT INTO VERSION VALUES (" . YIOOP_VERSION . ")");
|
||||
$creation_time = L\microTimestamp();
|
||||
/* store a newly generated Fiat-Shamir modulus in the profile; it is used
   below to compute the numerical value of the blank password
 */
|
||||
$profile = $profile_model->getProfile(WORK_DIRECTORY);
|
||||
$new_profile = $profile;
|
||||
$new_profile['FIAT_SHAMIR_MODULUS'] = L\generateFiatShamirModulus();
|
||||
$profile_model->updateProfile(WORK_DIRECTORY, $new_profile, $profile);
|
||||
if ($new_profile['FIAT_SHAMIR_MODULUS']) {
|
||||
$sha1_of_blank_string = L\bchexdec(sha1(''));
|
||||
//calculating V = S ^ 2 mod N
|
||||
$temp = bcpow($sha1_of_blank_string . '', '2');
|
||||
$zkp_password = ($new_profile['FIAT_SHAMIR_MODULUS']) ?
|
||||
bcmod($temp, $new_profile['FIAT_SHAMIR_MODULUS']) : "";
|
||||
} else {
|
||||
$sha1_of_blank_string = "";
|
||||
$zkp_password = "";
|
||||
}
|
||||
//default account is root without a password
|
||||
$sql ="INSERT INTO USERS VALUES (" . ROOT_ID . ", 'admin', 'admin','" .
|
||||
ROOT_USERNAME . "',
|
||||
'root@dev.null', '".L\crawlCrypt('')."', '".ACTIVE_STATUS.
|
||||
"', '".L\crawlCrypt(ROOT_USERNAME . AUTH_KEY . $creation_time).
|
||||
"', 0,'$creation_time', 0, 0, '$zkp_password')";
|
||||
$db->execute($sql);
|
||||
/* public account is an inactive account used for public permissions;
   like the root account above, it is created with a blank password
 */
|
||||
$sql ="INSERT INTO USERS VALUES (".PUBLIC_USER_ID.", 'all', 'all','public',
|
||||
'public@dev.null', '".L\crawlCrypt('')."', '".INACTIVE_STATUS.
|
||||
"', '".L\crawlCrypt('public' . AUTH_KEY . $creation_time)."', 0,
|
||||
'$creation_time', 0, 0, '$zkp_password')";
|
||||
$db->execute($sql);
|
||||
//default public group with group id 1
|
||||
$creation_time = L\microTimestamp();
|
||||
$sql = "INSERT INTO GROUPS VALUES(".PUBLIC_GROUP_ID.",'Public','".
|
||||
$creation_time."','".ROOT_ID."', '".PUBLIC_JOIN."', '".GROUP_READ.
|
||||
"', ".NON_VOTING_GROUP.", " . FOREVER . ")";
|
||||
$db->execute($sql);
|
||||
$now = time();
|
||||
$db->execute("INSERT INTO ROLE VALUES (".ADMIN_ROLE.", 'Admin' )");
|
||||
$db->execute("INSERT INTO ROLE VALUES (".USER_ROLE.", 'User' )");
|
||||
$db->execute("INSERT INTO ROLE VALUES (".BUSINESS_ROLE.", 'Business User' )");
|
||||
$db->execute("INSERT INTO ROLE VALUES (".BOT_ROLE.", 'Bot User' )");
|
||||
$db->execute("INSERT INTO USER_ROLE VALUES (".ROOT_ID.", ".ADMIN_ROLE.")");
|
||||
$db->execute("INSERT INTO USER_GROUP VALUES (".ROOT_ID.", ".
|
||||
PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)");
|
||||
$db->execute("INSERT INTO USER_GROUP VALUES (".PUBLIC_USER_ID.", ".
|
||||
PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)");
|
||||
//Create a Group for Wiki HELP.
|
||||
$sql = "INSERT INTO GROUPS VALUES(" . HELP_GROUP_ID . ",'Help','" .
|
||||
$creation_time . "','" . ROOT_ID . "',
|
||||
'" . PUBLIC_BROWSE_REQUEST_JOIN . "', '" . GROUP_READ_WIKI .
|
||||
"', " . UP_DOWN_VOTING_GROUP . ", " . FOREVER . ")";
|
||||
$db->execute($sql);
|
||||
$now = time();
|
||||
$db->execute("INSERT INTO USER_GROUP VALUES (" . ROOT_ID . ", " .
|
||||
HELP_GROUP_ID . ", " . ACTIVE_STATUS . ", $now)");
|
||||
$db->execute("INSERT INTO USER_GROUP VALUES (" . PUBLIC_USER_ID . ", " .
|
||||
HELP_GROUP_ID . ", " . ACTIVE_STATUS . ", $now)");
|
||||
$group_model = new GroupModel(DB_NAME, false);
|
||||
$group_model->db = $db;
|
||||
// Insert Default Public Wiki Pages
|
||||
if (file_exists(APP_DIR . "/configs/PublicHelpPages.php")) {
|
||||
require_once APP_DIR."/configs/PublicHelpPages.php";
|
||||
} else {
|
||||
require_once BASE_DIR."/configs/PublicHelpPages.php";
|
||||
}
|
||||
$default_locale = L\getLocaleTag();
|
||||
foreach ($public_pages as $locale_tag => $locale_pages) {
|
||||
L\setLocaleObject($locale_tag);
|
||||
foreach ($locale_pages as $page_name => $page_content) {
|
||||
$page_name = str_replace(" ", "_", $page_name);
|
||||
$page_content = str_replace("'", "'", $page_content);
|
||||
$group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, $page_name,
|
||||
$page_content, $locale_tag, "",
|
||||
L\tl('social_component_page_created', $page_name),
|
||||
L\tl('social_component_page_discuss_here'));
|
||||
}
|
||||
}
|
||||
//Insert Default Public Help pages
|
||||
foreach ($help_pages as $locale_tag => $locale_pages) {
|
||||
L\setLocaleObject($locale_tag);
|
||||
foreach ($locale_pages as $page_name => $page_content) {
|
||||
$page_name = str_replace(" ", "_", $page_name);
|
||||
$page_content = str_replace("'", "'", $page_content);
|
||||
$group_model->setPageName(ROOT_ID, HELP_GROUP_ID, $page_name,
|
||||
$page_content, $locale_tag, "",
|
||||
L\tl('social_component_page_created', $page_name),
|
||||
L\tl('social_component_page_discuss_here'));
|
||||
}
|
||||
}
|
||||
L\setLocaleObject($default_locale);
|
||||
/* End Help content insertion. */
|
||||
/*
|
||||
Set up generic page relationship
|
||||
*/
|
||||
$db->execute("INSERT INTO PAGE_RELATIONSHIP VALUES (-1, 'generic_links')");
|
||||
/* we insert 1 by 1 rather than comma separate as sqlite
|
||||
does not support comma separated inserts
|
||||
*/
|
||||
$locales = [
|
||||
['en-US', 'English', 'lr-tb'],
|
||||
['ar', 'العربية', 'rl-tb'],
|
||||
['de', 'Deutsch', 'lr-tb'],
|
||||
['es', 'Español', 'lr-tb'],
|
||||
['fr-FR', 'Français', 'lr-tb'],
|
||||
['he', 'עברית', 'rl-tb'],
|
||||
['in-ID', 'Bahasa', 'lr-tb'],
|
||||
['it', 'Italiano', 'lr-tb'],
|
||||
['ja', '日本語', 'lr-tb'],
|
||||
['ko', '한국어', 'lr-tb'],
|
||||
['nl', 'Nederlands', 'lr-tb'],
|
||||
['pl', 'Polski', 'lr-tb'],
|
||||
['pt', 'Português', 'lr-tb'],
|
||||
['ru', 'Русский', 'lr-tb'],
|
||||
['th', 'ไทย', 'lr-tb'],
|
||||
['vi-VN', 'Tiếng Việt', 'lr-tb'],
|
||||
['zh-CN', '中文', 'lr-tb'],
|
||||
['kn', 'ಕನ್ನಡ', 'lr-tb'],
|
||||
['hi', 'हिन्दी', 'lr-tb'],
|
||||
['tr', 'Türkçe', 'lr-tb'],
|
||||
['fa', 'فارسی', 'rl-tb'],
|
||||
['te', 'తెలుగు', 'lr-tb'],
|
||||
];
|
||||
$i = 1;
|
||||
foreach ($locales as $locale) {
|
||||
$db->execute("INSERT INTO LOCALE VALUES ($i, '{$locale[0]}',
|
||||
'{$locale[1]}', '{$locale[2]}', '1')");
|
||||
$locale_index[$locale[0]] = $i;
|
||||
$i++;
|
||||
}
|
||||
$activities = [
|
||||
"manageAccount" => ['db_activity_manage_account',
|
||||
[
|
||||
"en-US" => 'Manage Account',
|
||||
"fa" => 'مدیریت حساب',
|
||||
"fr-FR" => 'Modifier votre compte',
|
||||
"ja" => 'アカウント管理',
|
||||
"ko" => '사용자 계정 관리',
|
||||
"nl" => 'Account Beheren',
|
||||
"vi-VN" => 'Quản lý tài khoản',
|
||||
"zh-CN" => '管理帳號',
|
||||
]],
|
||||
"manageUsers" => ['db_activity_manage_users',
|
||||
[
|
||||
"en-US" => 'Manage Users',
|
||||
"fa" => 'مدیریت کاربران',
|
||||
"fr-FR" => 'Modifier les utilisateurs',
|
||||
"ja" => 'ユーザー管理',
|
||||
"ko" => '사용자 관리',
|
||||
"nl" => 'Gebruikers beheren',
|
||||
"vi-VN" => 'Quản lý tên sử dụng',
|
||||
"zh-CN" => '管理使用者',
|
||||
]],
|
||||
"manageRoles" => ['db_activity_manage_roles',
|
||||
[
|
||||
"en-US" => 'Manage Roles',
|
||||
"fa" => 'مدیریت نقشها',
|
||||
"fr-FR" => 'Modifier les rôles',
|
||||
"ja" => '役割管理',
|
||||
"ko" => '사용자 권한 관리',
|
||||
"nl" => 'Rollen beheren',
|
||||
"vi-VN" => 'Quản lý chức vụ',
|
||||
]],
|
||||
"manageGroups" => ['db_activity_manage_groups',
|
||||
[
|
||||
"en-US" => 'Manage Groups',
|
||||
"fr-FR" => 'Modifier les groupes',
|
||||
"nl" => 'Groepen beheren',
|
||||
]],
|
||||
"manageCrawls" => ['db_activity_manage_crawl',
|
||||
[
|
||||
"en-US" => 'Manage Crawls',
|
||||
"fa" => 'مدیریت خزشها',
|
||||
"fr-FR" => 'Modifier les indexes',
|
||||
"ja" => '検索管理',
|
||||
"ko" => '크롤 관리',
|
||||
"nl" => 'Beheer Crawls',
|
||||
"vi-VN" => 'Quản lý sự bò',
|
||||
]],
|
||||
"groupFeeds" => ['db_activity_group_feeds',
|
||||
[
|
||||
"en-US" => 'Feeds and Wikis',
|
||||
"nl" => 'Feeds en Wikis',
|
||||
]],
|
||||
"mixCrawls" => ['db_activity_mix_crawls',
|
||||
[
|
||||
"en-US" => 'Mix Crawls',
|
||||
"fa" => 'ترکیبهای خزشها',
|
||||
"fr-FR" => 'Mélanger les indexes',
|
||||
"nl" => 'Mix Crawls',
|
||||
]],
|
||||
"manageClassifiers" => ['db_activity_manage_classifiers',
|
||||
[
|
||||
"en-US" => 'Manage Classifiers',
|
||||
"fa" => '',
|
||||
"fr-FR" => 'Classificateurs',
|
||||
"nl" => 'Beheer Classifiers',
|
||||
]],
|
||||
"pageOptions" => ['db_activity_file_options',
|
||||
[
|
||||
"en-US" => 'Page Options',
|
||||
"fa" => 'تنظیمات صفحه',
|
||||
"fr-FR" => 'Options de fichier',
|
||||
"nl" => 'Opties voor de pagina',
|
||||
]],
|
||||
"resultsEditor" => ['db_activity_results_editor',
|
||||
[
|
||||
"en-US" => 'Results Editor',
|
||||
"fa" => 'ویرایشگر نتایج',
|
||||
"fr-FR" => 'Éditeur de résultats',
|
||||
"nl" => 'Resultaten Editor',
|
||||
]],
|
||||
"searchSources" => ['db_activity_search_services',
|
||||
[
|
||||
"en-US" => 'Search Sources',
|
||||
"fa" => 'منابع جستجو',
|
||||
"fr-FR" => 'Sources de recherche',
|
||||
"nl" => 'Zoek Bronnen',
|
||||
]],
|
||||
"manageMachines" => ['db_activity_manage_machines',
|
||||
[
|
||||
"en-US" => 'Manage Machines',
|
||||
"fa" => 'مدیریت دستگاهها',
|
||||
"fr-FR" => 'Modifier les ordinateurs',
|
||||
"nl" => 'Beheer Machines',
|
||||
]],
|
||||
"manageLocales" => ['db_activity_manage_locales',
|
||||
[
|
||||
"en-US" => 'Manage Locales',
|
||||
"fa" => 'مدیریت زبانها',
|
||||
"fr-FR" => 'Modifier les lieux',
|
||||
"ja" => 'ローケル管理',
|
||||
"ko" => '로케일 관리',
|
||||
"nl" => 'Beheer varianten',
|
||||
"vi-VN" => 'Quản lý miền địa phương',
|
||||
]],
|
||||
"serverSettings" => ['db_activity_server_settings',
|
||||
[
|
||||
"en-US" => 'Server Settings',
|
||||
"fr-FR" => 'Serveurs',
|
||||
"nl" => 'Server Settings',
|
||||
]],
|
||||
"security" => ['db_activity_security',
|
||||
[
|
||||
"en-US" => 'Security',
|
||||
"fr-FR" => 'Sécurité',
|
||||
"nl" => 'Veiligheid',
|
||||
]],
|
||||
"appearance" => ['db_activity_appearance',
|
||||
[
|
||||
"en-US" => 'Appearance',
|
||||
"fr-FR" => 'Aspect',
|
||||
"nl" => 'Verschijning',
|
||||
]],
|
||||
"configure" => ['db_activity_configure',
|
||||
[
|
||||
"en-US" => 'Configure',
|
||||
"fa" => 'پیکربندی',
|
||||
"fr-FR" => 'Configurer',
|
||||
"ja" => '設定',
|
||||
"ko" => '구성',
|
||||
"nl" => 'Configureren',
|
||||
"vi-VN" => 'Sắp xếp hoạt động dựa theo hoạch định',
|
||||
]],
|
||||
"manageCredits" => ['db_activity_manage_credits',
|
||||
[
|
||||
"en-US" => 'Manage Credits',
|
||||
]],
|
||||
"manageAdvertisements" => ['db_activity_manage_advertisements',
|
||||
[
|
||||
"en-US" => 'Manage Advertisements',
|
||||
]],
|
||||
"scrapers" => ['db_activity_scrapers',
|
||||
[
|
||||
"en-US" => 'Web Scrapers',
|
||||
]]
|
||||
];
|
||||
/*
   Seed the ACTIVITY, ROLE_ACTIVITY, TRANSLATION and TRANSLATION_LOCALE
   tables from $activities. Each entry of $activities has the form
       method_name => [translation_string_id, [locale_tag => text, ...]]
   and $i, which counts entries starting from 1, is used both as the
   activity id and as the id of the activity's translation string.
 */
$i = 1;
foreach ($activities as $activity => $translation_info) {
    // set-up activity
    $db->execute("INSERT INTO ACTIVITY VALUES ($i, $i, '$activity')");
    /* the two advertising related activities go to the business role,
       every other activity is given to the admin role
     */
    $is_ad_activity = in_array($activity,
        ["manageCredits", "manageAdvertisements"], true);
    $role = ($is_ad_activity) ? BUSINESS_ROLE : ADMIN_ROLE;
    $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (" . $role . ", $i)");
    $db->execute("INSERT INTO TRANSLATION
        VALUES($i, '{$translation_info[0]}')");
    foreach ($translation_info[1] as $locale_tag => $translation) {
        $index = $locale_index[$locale_tag];
        /* The translated text is free-form, so it is bound as a parameter
           rather than spliced into the statement: a translation containing
           an apostrophe would otherwise produce invalid SQL
         */
        $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES ($i, $index, ?)",
            [$translation]);
    }
    $i++;
}
|
||||
/*
   Activities that the ordinary user role is allowed to perform
 */
$new_user_activities = [
    "manageAccount",
    "manageGroups",
    "mixCrawls",
    "groupFeeds"
];
/*
   Activity ids were assigned as 1, 2, 3, ... following the order of the
   keys of $activities, so walking those keys while counting from 1
   recovers the id that belongs to each activity name
 */
$activity_names = array_keys($activities);
foreach ($new_user_activities as $new_activity) {
    $i = 1;
    foreach ($activity_names as $activity_name) {
        if ($new_activity == $activity_name) {
            //give new user role the ability to have that activity
            $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (".
                USER_ROLE . ", $i)");
        }
        $i++;
    }
}
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634195',
|
||||
'YouTube', 'video', 'http://www.youtube.com/watch?v={}',
|
||||
'http://i1.ytimg.com/vi/{}/default.jpg', '')");
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634196',
|
||||
'MetaCafe', 'video', 'http://www.metacafe.com/watch/{}',
|
||||
'http://www.metacafe.com/thumb/{}.jpg', '')");
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634197',
|
||||
'DailyMotion', 'video', 'http://www.dailymotion.com/video/{}',
|
||||
'http://www.dailymotion.com/thumbnail/video/{}', '')");
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634198',
|
||||
'Vimeo', 'video', 'http://player.vimeo.com/video/{}',
|
||||
'http://www.yioop.com/resources/blank.png?{}', '')");
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634199',
|
||||
'Break.com', 'video', 'http://www.break.com/index/{}', '" .
|
||||
NAME_SERVER . "/resources/blank.png?{}', '')");
|
||||
$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634200',
|
||||
'Yahoo News', 'rss', 'http://news.yahoo.com/rss/',
|
||||
'//content/@url', 'en-US')");
|
||||
$db->execute("INSERT INTO CRAWL_MIXES VALUES (2, 'images', ".ROOT_ID.", -1)");
|
||||
$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(2, 0, 1)");
|
||||
$db->execute("INSERT INTO MIX_COMPONENTS VALUES(
|
||||
2, 0, 1, 1, 'media:image site:doc')");
|
||||
$db->execute("INSERT INTO CRAWL_MIXES VALUES (3, 'videos', ".ROOT_ID.", -1)");
|
||||
$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(3, 0, 1)");
|
||||
$db->execute("INSERT INTO MIX_COMPONENTS VALUES(
|
||||
3, 0, 1, 1, 'media:video site:doc')");
|
||||
$db->execute("INSERT INTO CRAWL_MIXES VALUES (4, 'news', ".ROOT_ID.", -1)");
|
||||
$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(4, 0, 1)");
|
||||
$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 1, 1,
|
||||
'media:news')");
|
||||
$db->execute("INSERT INTO SUBSEARCH VALUES('db_subsearch_images',
|
||||
'images','m:2', 50)");
|
||||
$db->execute("INSERT INTO TRANSLATION VALUES (1002,'db_subsearch_images')");
|
||||
$db->execute("INSERT INTO SUBSEARCH VALUES ('db_subsearch_videos',
|
||||
'videos','m:3', 10)");
|
||||
$db->execute("INSERT INTO TRANSLATION VALUES (1003,'db_subsearch_videos')");
|
||||
$db->execute("INSERT INTO SUBSEARCH VALUES ('db_subsearch_news',
|
||||
'news','m:4',20)");
|
||||
$db->execute("INSERT INTO TRANSLATION VALUES (1004,'db_subsearch_news')");
|
||||
$sql = "INSERT INTO SCRAPER(NAME, SIGNATURE, SCRAPE_RULES) VALUES (?,?,?)";
|
||||
$scrapers = [
|
||||
["YIOOP", "/html/head/*[contains(@href,".
|
||||
"'c=resource&a=get&f=css&n=auxiliary.css')]",
|
||||
"//div[contains(@class, 'body-container')]###" .
|
||||
"//*[contains(@id, 'message')]###//*[contains(@id, 'help')]###" .
|
||||
"//*[contains(@id, 'MathJax')]###" .
|
||||
"//*[contains(@class, 'component-container')]###" .
|
||||
"//*[contains(@class, 'top-bar')]###".
|
||||
"//*[contains(@class, 'query-statistics')]###" .
|
||||
"//*[contains(@class, 'admin-collapse')]###" .
|
||||
"//option[not(contains(@selected, 'selected'))]###" .
|
||||
"//*[contains(@id, 'suggest')]###//*[contains(@id, 'spell')]"],
|
||||
["DRUPAL", "/html/head/*[contains(@href, '/sites/all/themes') or " .
|
||||
"contains(@href, '/sites/default/files') or ".
|
||||
"contains(@content, 'Drupal')]",
|
||||
"//div[@id='page']|//main" .
|
||||
"###//*[contains(@id,'comments')]" .
|
||||
"###//*[contains(@id,'respond')]" .
|
||||
"###//*[contains(@class,'bottomcontainerBox')]" .
|
||||
"###//*[contains(@class,'post-by')]" .
|
||||
"###//*[contains(@class,'entry meta-clear')]"],
|
||||
["MEDIAWIKI", "//meta[contains(@content, 'MediaWiki')]",
|
||||
"//*[contains(@id, 'mw-content-text')]###".
|
||||
"//*[contains(@class, 'nmbox')]###//*[contains(@class, 'hatnote')]###".
|
||||
"//*[contains(@class, 'infobox')]"],
|
||||
["VBULLETIN", "/html/head/*[contains(@href,'vbulletin')]",
|
||||
"//div[contains(@class, 'body_wrapper')]###" .
|
||||
"//*[contains(@id, 'above')]###//*[contains(@id, 'below')]###" .
|
||||
"//*[contains(@id, 'breadcrumb')]###//*[contains(@id, 'notices')]###" .
|
||||
"//*[contains(@id, 'footer')]###".
|
||||
"//*[contains(@id, 'forum_info_options')]###" ."
|
||||
//*[contains(@class, 'threadlisthead')]###" ."
|
||||
//*[contains(@class, 'threaddetails')]###".
|
||||
"//*[contains(@id, 'pagination')]###".
|
||||
"//*[contains(@class, 'threadstats')]###".
|
||||
"//*[contains(@class, 'threadlastpost')]###".
|
||||
"//span[contains(@class, 'label')]"],
|
||||
["WORDPRESS", "/html/head/*[contains(@href, 'wp-content')".
|
||||
" or contains(@href, 'wp-includes')]",
|
||||
"//div[starts-with(@id, 'post-') and " .
|
||||
"'post-' = translate(@id, '0123456789', '') and " .
|
||||
"string-length(@id) >4]|//div[contains(@class, 'homepagewrapper')]###".
|
||||
"//*[contains(@id, 'entry-comments')]###" .
|
||||
"//*[contains(@class, 'sharedaddy')]###" .
|
||||
"//*[contains(@class, 'blog-subscribe')]###".
|
||||
"//*[contains(@id, 'entry-side')]"]
|
||||
];
|
||||
foreach ($scrapers as $scraper) {
|
||||
$db->execute($sql, $scraper);
|
||||
}
|
||||
$subsearch_translations = [
|
||||
'db_subsearch_images' => [
|
||||
'en-US' => 'Images',
|
||||
'ar' => 'لصور',
|
||||
'fa' => 'تصاوی',
|
||||
'fr-FR' => 'Images',
|
||||
'nl' => 'Beelden',
|
||||
'vi-VN' => 'Hình',
|
||||
'zh-CN' => '图象'
|
||||
],
|
||||
'db_subsearch_videos' => [
|
||||
'en-US' => 'Videos',
|
||||
'ar' => 'فيدي',
|
||||
'fa' => 'ویدیوها',
|
||||
'fr-FR' => 'Vidéos',
|
||||
'nl' => 'Videos',
|
||||
'vi-VN' => 'Thâu hình',
|
||||
'zh-CN' => '录影'
|
||||
],
|
||||
'db_subsearch_news' => [
|
||||
'en-US' => 'News',
|
||||
'ar' => 'أخبار',
|
||||
'fa' => 'اخبا',
|
||||
'fr-FR' => 'Actualités',
|
||||
'nl' => 'Nieuws',
|
||||
'vi-VN' => 'Tin tức',
|
||||
'zh-CN' => '新闻'
|
||||
]
|
||||
];
|
||||
foreach ($subsearch_translations as $identifier => $locale_translations) {
|
||||
foreach ($locale_translations as $locale_tag => $translation) {
|
||||
L\updateTranslationForStringId($db, $identifier, $locale_tag,
|
||||
$translation);
|
||||
}
|
||||
}
|
||||
if (stristr(DB_HOST, "pgsql") !== false) {
    /* The rows above were inserted with explicit ids, which does not
       advance the sequences behind Postgres SERIAL columns. For each such
       column, move its sequence up to the largest id currently stored so
       that later auto-generated ids do not collide with the seeded rows.
     */
    $auto_tables = ["ACTIVITY" =>"ACTIVITY_ID",
        "GROUP_ITEM" =>"GROUP_ITEM_ID", "GROUP_PAGE" => "GROUP_PAGE_ID",
        "GROUPS" => "GROUP_ID", "LOCALE"=> "LOCALE_ID", "ROLE" => "ROLE_ID",
        "TRANSLATION" => "TRANSLATION_ID", "USERS" => "USER_ID"];
    foreach ($auto_tables as $table => $auto_column) {
        $sql = "SELECT MAX($auto_column) AS NUM FROM $table";
        $result = $db->execute($sql);
        $row = ($result) ? $db->fetchArray($result) : false;
        if (empty($row['NUM'])) {
            /* MAX() is NULL for a table with no rows; splicing that into
               setval() would give the invalid statement "setval('seq', )".
               Such a table has no seeded ids, so its sequence can stay
               as it is
             */
            continue;
        }
        $next = $row['NUM'];
        $sequence = strtolower("{$table}_{$auto_column}_seq");
        $sql = "SELECT setval('$sequence', $next)";
        $db->execute($sql);
        $sql = "SELECT nextval('$sequence')";
        $db->execute($sql);
    }
}
|
||||
|
||||
$db->disconnect();
|
||||
if (in_array(DBMS, ['sqlite','sqlite3'])){
|
||||
chmod(CRAWL_DIR."/data/".DB_NAME.".db", 0666);
|
||||
}
|
||||
echo "Create DB succeeded\n";
|
69
src/configs/CreditConfig.php
Normal file
69
src/configs/CreditConfig.php
Normal file
|
@ -0,0 +1,69 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop -- Credit Card Configuration
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
* All rights reserved
|
||||
*/
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
/**
 * Collection of static hooks used for payment processing when keyword
 * advertising is enabled.
 *
 * This particular class is a "blank" implementation: it does not charge
 * credit cards. An implementation that uses stripe.com for payment
 * processing can be obtained from seekquarry.com. Putting that
 * implementation in the APP_DIR/configs/ folder would then enable real
 * credit card processing in Yioop.
 */
class CreditConfig
{
    /**
     * Says whether the CreditConfig in use can actually take payments
     * (charge cards, receive bitcoins, etc).
     *
     * @return bool always false for this blank implementation
     */
    public static function isActive()
    {
        return false;
    }
    /**
     * Gives the URL of the Javascript library of the credit payment agency
     * (for example, stripe.com). Such a library is responsible for securely
     * sending the credit card details to the agency and then passing an
     * authorization token along with the form to the Yioop backend.
     *
     * @return string the empty string for this blank implementation
     */
    public static function getCreditTokenUrl()
    {
        return "";
    }
    /**
     * Used to get the field value for an input tag that has an attribute
     * named $name whose value is $value.
     *
     * @param string $name name of the attribute (usually data-)
     * @param string $value value of the attribute
     * @return string field value of the corresponding input tag; this blank
     *      implementation ignores both arguments and answers "data-ignore"
     */
    public static function getAttribute($name, $value)
    {
        return "data-ignore";
    }
    /**
     * Server side method responsible for actually charging the credit card.
     *
     * @param float $amount dollar amount to charge the card
     * @param string $token token issued for the transaction by the card
     *      processing agency
     * @param string& $message message to use as the reason for the charge
     * @return bool whether or not the charge was successful; this blank
     *      implementation charges nothing, leaves $message alone, and
     *      reports success
     */
    public static function charge($amount, $token, &$message)
    {
        return true;
    }
}
|
147
src/configs/ExportPublicHelpDb.php
Normal file
147
src/configs/ExportPublicHelpDb.php
Normal file
|
@ -0,0 +1,147 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* This script can be used to export the public and help wiki pages for
|
||||
* Yioop system to the file PublicHelpPages.php . This file is then
|
||||
* used by createdb.php when creating a fresh version of the Yioop database.
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
use seekquarry\yioop\library as L;
|
||||
|
||||
if (!empty($_SERVER['DOCUMENT_ROOT'])) {
|
||||
echo "BAD REQUEST";
|
||||
exit();
|
||||
}
|
||||
/** For crawlHash function */
|
||||
require_once __DIR__."/../library/Utility.php";
|
||||
$db_class = NS_DATASOURCES . ucfirst(DBMS)."Manager";
|
||||
$dbinfo = ["DBMS" => DBMS, "DB_HOST" => DB_HOST, "DB_USER" => DB_USER,
|
||||
"DB_PASSWORD" => DB_PASSWORD, "DB_NAME" => DB_NAME];
|
||||
$db = new $db_class();
|
||||
if (!in_array(DBMS, ['sqlite', 'sqlite3'])) {
|
||||
$db->connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME);
|
||||
} else {
|
||||
$db->connect();
|
||||
}
|
||||
$sql = "SELECT GPH.TITLE AS TITLE, GPH.PAGE AS PAGE, ".
|
||||
" GPH.LOCALE_TAG AS LOCALE_TAG FROM GROUP_PAGE_HISTORY GPH WHERE ".
|
||||
" GPH.GROUP_ID='".PUBLIC_GROUP_ID."' AND GPH.LOCALE_TAG <> '' AND ".
|
||||
" NOT EXISTS (SELECT * FROM GROUP_PAGE_HISTORY GP WHERE ".
|
||||
" GPH.PAGE_ID=GP.PAGE_ID AND ".
|
||||
" GPH.PUBDATE < GP.PUBDATE) ORDER BY GPH.LOCALE_TAG, GPH.TITLE";
|
||||
$result = $db->execute($sql);
|
||||
$app_config_dir = APP_DIR . "/configs";
|
||||
if (!file_exists($app_config_dir)) {
|
||||
L\crawlLog("$app_config_dir does not exists, trying to make it...\n");
|
||||
if (!mkdir($app_config_dir)) {
|
||||
L\crawlLog("Make $app_config_dir failed, quitting");
|
||||
exit();
|
||||
}
|
||||
}
|
||||
$out_file = "$app_config_dir/PublicHelpPages.php";
|
||||
$out = "<"."?php\n";
|
||||
$out .= <<< EOD
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Default Public Wiki Pages
|
||||
*
|
||||
* This file should be generated using ExportPublicHelpDb.php
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
EOD;
|
||||
$out .= "\nnamespace " . substr(NS_CONFIGS, 0, -1) . ";\n\n";
|
||||
$out .= <<< EOD
|
||||
/**
|
||||
* Public wiki pages
|
||||
* @var array
|
||||
*/
|
||||
|
||||
EOD;
|
||||
$out .= '$public_pages = [];'."\n";
|
||||
if ($result) {
|
||||
while($row = $db->fetchArray($result)) {
|
||||
$out .= '$public_pages["' . $row['LOCALE_TAG'] . '"]["' .
|
||||
$row['TITLE'] . '"] = <<< '."'EOD'\n";
|
||||
$out .= $row['PAGE'] ."\nEOD;\n";
|
||||
}
|
||||
}
|
||||
$out .= "//\n// Default Help Wiki Pages\n//\n";
|
||||
$sql = "SELECT GPH.TITLE AS TITLE, GPH.PAGE AS PAGE, ".
|
||||
" GPH.LOCALE_TAG AS LOCALE_TAG FROM GROUP_PAGE_HISTORY GPH WHERE ".
|
||||
" GPH.GROUP_ID='".HELP_GROUP_ID."' AND GPH.LOCALE_TAG <> '' AND ".
|
||||
" NOT EXISTS (SELECT * FROM GROUP_PAGE_HISTORY GP WHERE ".
|
||||
" GPH.PAGE_ID=GP.PAGE_ID AND ".
|
||||
" GPH.PUBDATE < GP.PUBDATE) ORDER BY GPH.LOCALE_TAG, GPH.TITLE";
|
||||
$result = $db->execute($sql);
|
||||
$out .= <<< EOD
|
||||
/**
|
||||
* Help wiki pages
|
||||
* @var array
|
||||
*/
|
||||
EOD;
|
||||
$out .= '$help_pages = [];'."\n";
|
||||
if ($result) {
    while($row = $db->fetchArray($result)) {
        /* Write each help page as a nowdoc (quoted 'EOD'), the same way the
           public pages are written earlier in this script. With an unquoted
           heredoc any $ or backslash sequence occurring in the wiki text
           would be interpolated when the generated file is loaded, so the
           page read back could differ from the page that was exported.
         */
        $out .= '$help_pages["' . $row['LOCALE_TAG'] . '"]["' .
            $row['TITLE'] . '"] = <<< ' . "'EOD'\n";
        $out .= $row['PAGE'] . "\nEOD;\n";
    }
}
|
||||
$out .= "\n";
|
||||
file_put_contents($out_file, $out);
|
||||
L\crawlLog("Wrote export data to $out_file");
|
||||
|
2301
src/configs/PublicHelpPages.php
Normal file
2301
src/configs/PublicHelpPages.php
Normal file
File diff suppressed because it is too large
Load diff
292
src/configs/TokenTool.php
Normal file
292
src/configs/TokenTool.php
Normal file
|
@ -0,0 +1,292 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* TokenTool is used to create suggest word dictionaries and 'n' word gram
|
||||
* filter files for the Yioop! search engine.
|
||||
*
|
||||
* A description of its usage is given in the $usage global variable
|
||||
*
|
||||
*
|
||||
* @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett (modified for n
|
||||
* ngrams)
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
|
||||
namespace seekquarry\yioop\configs;
|
||||
|
||||
use seekquarry\yioop\library\NWordGrams;
|
||||
use seekquarry\yioop\library\Trie;
|
||||
|
||||
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
|
||||
ini_set("memory_limit","1500M");
|
||||
/** Load in global configuration settings */
|
||||
require_once 'Config.php';
|
||||
/**
|
||||
* Used to print out a description of how to use TokenTool.php
|
||||
* @var string
|
||||
*/
|
||||
$usage = <<<EOD
|
||||
TokenTool.php
|
||||
==============
|
||||
|
||||
Usage
|
||||
=====
|
||||
TokenTool is used to create suggest word dictionaries,
|
||||
segment and 'n' word gram filter files for the Yioop! search engine.
|
||||
To create either of these items, the user
|
||||
puts a source file in Yioop's WORK_DIRECTORY/prepare folder. Suggest word
|
||||
dictionaries are used to supply the content of the dropdown of search terms
|
||||
that appears as a user is entering a query in Yioop. To make a suggest
|
||||
dictionary one can use a command like:
|
||||
|
||||
php TokenTool.php dictionary filename locale endmarker
|
||||
|
||||
Here filename should be in the current folder or PREP_DIR and should consist
|
||||
of one word per line, locale is the locale this suggest (for example, en-US)
|
||||
file is being made for and where a file suggest-trie.txt.gz will be written,
|
||||
and endmarker is the end of word symbol to use in the trie. For example,
|
||||
$ works pretty well.
|
||||
|
||||
TokenTool.php can also be used to make filter files. A filter file is used to
|
||||
detect when words in a language should be treated as a unit when extracting text
|
||||
during a crawl and at search time. For example, Bill Clinton is 2 word gram
|
||||
which should be treated as unit because it is a particular person. These
|
||||
filter files can also be used with a segmenter which
|
||||
might be used to split Chinese or Japanese text which does not have spaces into
|
||||
a sequence of Chinese and Japanese words (which may be made out of multiple
|
||||
characters). For a nonsegmenter filter, TokenTool.php is run from the
|
||||
command line as:
|
||||
|
||||
php TokenTool.php filter wiki_file lang locale n extract_type max_to_extract
|
||||
|
||||
where file is a wikipedia xml file or is a bz2 compressed xml file whose urls
|
||||
or wiki page count dump file (it can also be a folder of these kind of files)
|
||||
used to determine the n-grams,
|
||||
lang is an Wikipedia language tag (ignored in segmenter case),
|
||||
locale is the IANA language tag of the locale to store the results for
|
||||
(if different from lang, for example, en-US versus en for lang), n is the
|
||||
number of words in a row to consider , extract_type is where from Wikipedia
|
||||
source to extract:
|
||||
|
||||
0 = title's,
|
||||
1 = redirect's,
|
||||
2 = page count dump wikipedia data,
|
||||
3 = page count dump wiktionary data.
|
||||
|
||||
For a segmenter filter, TokenTool.php is run from the
|
||||
command line as:
|
||||
|
||||
php TokenTool.php segment-filter dictionary_file locale
|
||||
|
||||
Here dictionary_file should be a text file with one word/line,
|
||||
locale is the IANA language tag of the locale to store the results for.
|
||||
|
||||
|
||||
Obtaining Data
|
||||
==============
|
||||
Many word lists are obtainable on the web for free with Creative Commons
|
||||
licenses. A good starting point is:
|
||||
http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
|
||||
A little script-fu can generally take such a list and put it into the
|
||||
format of one word/term per line which is needed by TokenTool.php
|
||||
|
||||
For filter file, Raw page count dumps can be found at
|
||||
http://dumps.wikimedia.org/other/pagecounts-raw/
|
||||
These probably give the best n-gram or all gram results, usually
|
||||
in a matter of minutes; nevertheless, this tool does support trying to extract
|
||||
similar data from Wikipedia dumps. This can take hours.
|
||||
|
||||
For Wikipedia dumps, one can go to http://dumps.wikimedia.org/enwiki/
|
||||
and obtain a dump of the English Wikipedia (similar for other languages).
|
||||
This page lists all the dumps according to date they were taken. Choose any
|
||||
suitable date or the latest. A link with a label such as 20120104/, represents
|
||||
a dump taken on 01/04/2012. Click this link to go in turn to a page which has
|
||||
many links based on type of content you are looking for. For
|
||||
this tool you are interested in files under
|
||||
|
||||
"Recombine all pages, current versions only".
|
||||
|
||||
Beneath this we might find a link with a name like:
|
||||
enwiki-20120104-pages-meta-current.xml.bz2
|
||||
which is a file that could be processed by this tool.
|
||||
|
||||
A Creative Commons licensed file which can be manipulated into a dictionary
|
||||
file suitable for Chinese segmentation can be found at:
|
||||
http://www.mdbg.net/chindict/chindict.php?page=cc-cedict
|
||||
|
||||
EOD;
|
||||
/*
   Command line dispatch: $argv[1] names the kind of file to build and the
   remaining arguments are passed on to the code that builds it. When the
   arguments cannot be understood the usage text is printed instead.
 */
$num_args = count($argv);
if ( $num_args < 3 || $num_args > 8) {
    echo $usage;
    exit();
}
switch ($argv[1]) {
    case "dictionary":
        // locale and end-of-word marker are optional for this command
        if (!isset($argv[3])) {
            $argv[3] = "en-US";
        }
        if (!isset($argv[4])) {
            $argv[4] = "$";
        }
        makeSuggestTrie($argv[2], $argv[3], $argv[4]);
        break;
    case "filter":
        // drop the script name and the command, leaving just the arguments
        array_shift($argv);
        array_shift($argv);
        makeNWordGramsFiles($argv);
        break;
    case "segment-filter":
        if (!isset($argv[3])) {
            /* the locale is needed to know where to write the filter, so
               without it show the usage rather than read an argument that
               was never given
             */
            echo $usage;
            exit();
        }
        $file_path = PREP_DIR."/";
        if (!file_exists($file_path.$argv[2])) {
            echo $argv[2]." does not exist in ".$file_path;
            exit();
        }
        NWordGrams::makeSegmentFilterFile($file_path.$argv[2], $argv[3]);
        break;
    default:
        echo $usage;
        exit();
}
if (!PROFILE) {
    echo "Please configure the search engine instance ".
        "by visiting its web interface on localhost.\n";
    exit();
}
|
||||
/**
 * Makes an n or all word gram Bloom filter based on the supplied arguments.
 * Wikipedia files are assumed to have been placed in the PREP_DIR before
 * this is run. The filter is written into the resources folder of the given
 * locale.
 *
 * @param array $args command line arguments with first two elements of $argv
 *      removed. By position these are: [0] source file name (looked up in
 *      PREP_DIR), [1] Wikipedia language tag, [2] locale tag, [3] number of
 *      words per gram or "all", [4] extract type, [5] maximum number of
 *      grams to extract. Everything except [0] is optional and is given a
 *      default when missing. For further details see the $usage variable
 */
function makeNWordGramsFiles($args)
{
    if (!isset($args[1])) {
        $args[1] = "en";
        $args[2] = "en-US";
    }
    if (!isset($args[2])) {
        $args[2] = $args[1];
    }
    if (!isset($args[3])) {
        $args[3] = 2; // bigrams
    }
    /* This has to test $args: $argv is not visible inside a function, so
       testing it would always succeed and throw away the extract type the
       user asked for
     */
    if (!isset($args[4])) {
        $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
    }
    /* Only choose a maximum when the user did not supply one. The extract
       type is $args[4] ($args[2] is the locale); it is compared loosely as
       it arrives from the command line as a string.
       NOTE(review): -1 presumably means no limit -- confirm in NWordGrams
     */
    if (!isset($args[5])) {
        $args[5] = ($args[3] === "all" &&
            $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) ? 400000 : -1;
    }
    $wiki_file_path = PREP_DIR."/";
    if (!file_exists($wiki_file_path.$args[0])) {
        echo $args[0]." does not exist in $wiki_file_path";
        exit();
    }
    /*
     * This call creates an ngrams text file from the input xml file and
     * returns the count of ngrams in the text file.
     */
    list($num_ngrams, $max_gram_len) =
        NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2],
        $args[3], $args[4], $args[5]);

    /*
     * This call creates a bloom filter file from the n word grams text file
     * based on the language specified. The lang passed as parameter is
     * prefixed to the filter file name. The count of n word grams in the
     * text file is passed as a parameter to set the limit of n word grams
     * in the filter file.
     */
    NWordGrams::makeNWordGramsFilterFile($args[2], $args[3], $num_ngrams,
        $max_gram_len);
}
|
||||
|
||||
/**
 * Makes a trie that can be used to make word suggestions as someone enters
 * terms into the Yioop! search box. Outputs the result into the file
 * suggest_trie.txt.gz in the supplied locale dir
 *
 * @param string $dict_file where the word list is stored, one word per line
 * @param string $locale which locale to write the suggest file to
 * @param string $end_marker used to indicate end of word in the trie
 */
function makeSuggestTrie($dict_file, $locale, $end_marker)
{
    $locale = str_replace("-", "_", $locale);
    $out_file = LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz";

    // Read and load the dictionary file
    $words = fileWithTrim($dict_file);
    if (empty($words)) {
        /* Nothing was read (the file is missing or has no words). Stop here
           so a suggest file that already exists is not replaced by an
           empty trie
         */
        echo "No words found in $dict_file, $out_file not written\n";
        return;
    }
    sort($words);
    $trie = new Trie($end_marker);

    /* Ignore the words in the following cases. If the word
     *  - contains punctuation
     *  - is less than 3 characters
     * mb_ereg is used for the punctuation test as it searches the whole
     * word; mb_ereg_match only tries to match at the start of the word
     */
    foreach ($words as $word) {
        $has_punctuation = (mb_ereg("\p{P}", $word) !== false);
        if (!$has_punctuation && mb_strlen($word) > 2) {
            $trie->add($word);
        }
    }
    $output = [];
    $output["trie_array"] = $trie->trie_array;
    $output["end_marker"] = $trie->end_marker;
    file_put_contents($out_file, gzencode(json_encode($output), 9));
}
|
||||
|
||||
/**
 * Loads a text file as an array of its lines, where each line has had
 * leading and trailing whitespace stripped and lines that are empty after
 * stripping are left out. The file is looked for first under the name
 * given and then, if that does not exist, under that name within PREP_DIR.
 * If neither exists a not found message is output.
 *
 * @param $file_name file to read into array
 * @return array of trimmed lines (empty array if the file was not found)
 */
function fileWithTrim($file_name)
{
    $path = file_exists($file_name) ? $file_name : PREP_DIR."/$file_name";
    if (!file_exists($path)) {
        echo "$path Not Found\n\n";
        return [];
    }
    $trimmed_lines = [];
    foreach (mb_split("\n", file_get_contents($path)) as $raw_line) {
        $candidate = preg_replace( "/(^\s+)|(\s+$)/us", "", $raw_line);
        if ($candidate == "") {
            // blank line (or a line preg_replace could not process)
            continue;
        }
        $trimmed_lines[] = $candidate;
    }
    return $trimmed_lines;
}
|
180
src/configs/default_crawl.ini
Normal file
180
src/configs/default_crawl.ini
Normal file
|
@ -0,0 +1,180 @@
|
|||
; ***** BEGIN LICENSE BLOCK *****
|
||||
; SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
; Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
;
|
||||
; This program is free software: you can redistribute it and/or modify
|
||||
; it under the terms of the GNU General Public License as published by
|
||||
; the Free Software Foundation, either version 3 of the License, or
|
||||
; (at your option) any later version.
|
||||
;
|
||||
; This program is distributed in the hope that it will be useful,
|
||||
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
; GNU General Public License for more details.
|
||||
;
|
||||
; You should have received a copy of the GNU General Public License
|
||||
; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
; ***** END LICENSE BLOCK *****
|
||||
;
|
||||
; default_crawl.ini
|
||||
;
|
||||
; This is an example of a crawl.ini configuration file. If you mess up
|
||||
; the crawl.ini you can simply delete it and this one will be used to recreate
|
||||
; it
|
||||
;
|
||||
[general]
|
||||
arc_dir = "";
|
||||
arc_type = "";
|
||||
crawl_order = 'ad';
|
||||
crawl_type = 'ax';
|
||||
page_range_request = '50000';
|
||||
page_recrawl_frequency = '-1';
|
||||
restrict_sites_by_url = false;
|
||||
summarizer_option = 'dl';
|
||||
max_description_len = '2000';
|
||||
|
||||
[indexed_file_types]
|
||||
extensions[] = 'unknown';
|
||||
extensions[] = 'bmp';
|
||||
extensions[] = 'doc';
|
||||
extensions[] = 'docx';
|
||||
extensions[] = 'csv';
|
||||
extensions[] = 'tab';
|
||||
extensions[] = 'tsv';
|
||||
extensions[] = 'txt';
|
||||
extensions[] = 'epub';
|
||||
extensions[] = 'asp';
|
||||
extensions[] = 'aspx';
|
||||
extensions[] = 'cgi';
|
||||
extensions[] = 'cfm';
|
||||
extensions[] = 'cfml';
|
||||
extensions[] = 'do';
|
||||
extensions[] = 'htm';
|
||||
extensions[] = 'html';
|
||||
extensions[] = 'jsp';
|
||||
extensions[] = 'php';
|
||||
extensions[] = 'pl';
|
||||
extensions[] = 'py';
|
||||
extensions[] = 'shtml';
|
||||
extensions[] = 'gif';
|
||||
extensions[] = 'xml';
|
||||
extensions[] = 'java';
|
||||
extensions[] = 'jpg';
|
||||
extensions[] = 'jpeg';
|
||||
extensions[] = 'pdf';
|
||||
extensions[] = 'png';
|
||||
extensions[] = 'ppt';
|
||||
extensions[] = 'pptx';
|
||||
extensions[] = 'py';
|
||||
extensions[] = 'rss';
|
||||
extensions[] = 'rtf';
|
||||
extensions[] = 'svg';
|
||||
extensions[] = 'xlsx';
|
||||
extensions[] = 'xml';
|
||||
|
||||
[allowed_sites]
|
||||
url[] = 'http://www.yahoo.com/';
|
||||
url[] = 'http://www.youtube.com/';
|
||||
url[] = 'http://www.google.com/';
|
||||
|
||||
[disallowed_sites]
|
||||
url[] = 'domain:arxiv.org';
|
||||
url[] = 'domain:ask.com';
|
||||
url[] = 'domain:yelp.com';
|
||||
url[] = 'domain:clixsense.com';
|
||||
|
||||
[seed_sites]
|
||||
url[] = 'http://www.ucanbuyart.com/';
|
||||
url[] = 'http://www.wikipedia.org/';
|
||||
url[] = 'http://www.dmoz.org/';
|
||||
url[] = 'http://www.yahoo.com/';
|
||||
url[] = 'http://www.google.com/';
|
||||
url[] = 'http://www.amazon.com/';
|
||||
url[] = 'http://www.bing.com/';
|
||||
url[] = 'http://www.facebook.com/';
|
||||
url[] = 'http://www.blogger.com/';
|
||||
url[] = 'http://www.myspace.com/';
|
||||
url[] = 'http://www.craigslist.org/';
|
||||
url[] = 'http://www.cnn.com/';
|
||||
url[] = 'http://www.about.com/';
|
||||
url[] = 'http://www.cnet.com/';
|
||||
url[] = 'http://www.adobe.com/';
|
||||
url[] = 'http://www.mozilla.com/';
|
||||
url[] = 'http://www.weather.com/';
|
||||
url[] = 'http://www.digg.com/';
|
||||
url[] = 'http://www.zynga.com/';
|
||||
url[] = 'http://www.download.com/';
|
||||
url[] = 'http://www.ebay.com/';
|
||||
url[] = 'http://eccc.hpi-web.de/';
|
||||
url[] = 'http://citeseerx.ist.psu.edu/';
|
||||
url[] = 'http://www.archive.org/';
|
||||
url[] = 'http://www.imdb.com/';
|
||||
url[] = 'http://www.zillow.com/';
|
||||
url[] = 'http://www.wolframalpha.com/';
|
||||
url[] = 'http://www.youtube.com/';
|
||||
url[] = 'http://www.sourceforge.net/';
|
||||
url[] = 'http://www.huffingtonpost.com/';
|
||||
url[] = 'http://www.wikimedia.org/';
|
||||
url[] = 'http://www.reference.com/';
|
||||
url[] = 'http://www.comcast.net/';
|
||||
url[] = 'http://www.dell.com/';
|
||||
url[] = 'http://www.metacafe.com/';
|
||||
url[] = 'http://www.foxnews.com/';
|
||||
url[] = 'http://www.hp.com/';
|
||||
url[] = 'http://www.stumbleupon.com';
|
||||
url[] = 'http://www.twitter.com/';
|
||||
url[] = 'http://www.wordpress.org/';
|
||||
url[] = 'http://www.bankofamerica.com/';
|
||||
url[] = 'http://www.xing.com/';
|
||||
url[] = 'http://www.microsoft.com/';
|
||||
url[] = 'http://www.mybrowserbar.com/';
|
||||
url[] = 'http://www.guardian.co.uk/';
|
||||
url[] = 'http://www.skyrock.com/';
|
||||
url[] = 'http://www.dailymail.co.uk/';
|
||||
url[] = 'http://www.ign.com/';
|
||||
url[] = 'http://www.mozilla.org/';
|
||||
url[] = 'http://www.vimeo.com/';
|
||||
url[] = 'http://www.wsj.com/';
|
||||
url[] = 'http://www.walmart.com/';
|
||||
url[] = 'http://www.reuters.com/';
|
||||
url[] = 'http://www.usps.com/';
|
||||
url[] = 'http://www.telegraph.co.uk/';
|
||||
url[] = 'http://www.babylon.com/';
|
||||
url[] = 'http://www.ups.com/';
|
||||
url[] = 'http://www.mapquest.com/';
|
||||
url[] = 'http://www.reddit.com/';
|
||||
url[] = 'http://www.theplanet.com/';
|
||||
url[] = 'http://bestbuy.com/';
|
||||
url[] = 'http://www.verizon.net/';
|
||||
url[] = 'http://www.onemanga.com/';
|
||||
url[] = 'http://www.latimes.com/';
|
||||
url[] = 'http://www.washingtonpost.com/';
|
||||
url[] = 'http://www.att.com/';
|
||||
url[] = 'http://www.w3schools.com/';
|
||||
url[] = 'http://www.fox.com/';
|
||||
url[] = 'http://www.ibm.com/';
|
||||
url[] = 'http://www.engadget.com/';
|
||||
url[] = 'http://www.usatoday.com/';
|
||||
url[] = 'http://www.chase.com/';
|
||||
url[] = 'http://www.wellsfargo.com/';
|
||||
url[] = 'http://www.nih.gov';
|
||||
url[] = 'http://www.irs.gov/';
|
||||
url[] = 'http://www.ftb.ca.gov/';
|
||||
url[] = 'http://www.monster.com/';
|
||||
url[] = 'http://www.timesonline.co.uk/';
|
||||
url[] = 'http://www.careerbuilder.com/';
|
||||
url[] = 'http://www.icq.com/';
|
||||
url[] = 'http://www.abcnews.go.com/';
|
||||
url[] = 'http://www.tmz.com/';
|
||||
url[] = 'http://www.fedex.com/';
|
||||
url[] = 'http://www.informer.com/';
|
||||
url[] = 'http://www.snopes.com/';
|
||||
url[] = 'http://www.urbandictionary.com/';
|
||||
url[] = 'http://www.slashdot.org/';
|
||||
url[] = 'http://www.php.net/';
|
||||
url[] = 'http://www.intuit.com/';
|
||||
url[] = 'http://www.thesun.co.uk/';
|
||||
|
||||
[page_rules]
|
||||
|
||||
[indexing_plugins]
|
671
src/controllers/AdminController.php
Normal file
671
src/controllers/AdminController.php
Normal file
|
@ -0,0 +1,671 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
use seekquarry\yioop\library\PageRuleParser;
|
||||
use seekquarry\yioop\library\Classifiers\Classifier;
|
||||
use seekquarry\yioop\library\CrawlDaemon;
|
||||
|
||||
/**
|
||||
* Controller used to handle admin functionalities such as
|
||||
* modify login and password, CREATE, UPDATE, DELETE operations
|
||||
* for users, roles, locale, and crawls
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class AdminController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Says which activities (roughly methods invoked from the web) this
|
||||
* controller will respond to (note: more activities will be loaded from
|
||||
* components)
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["crawlStatus", "machineStatus", "signout"];
|
||||
/**
|
||||
* An array of activities which are periodically updated within other
|
||||
* activities that they live. For example, within manage crawl,
|
||||
* the current crawl status is updated every 20 or so seconds.
|
||||
* @var array
|
||||
*/
|
||||
public $status_activities = ["crawlStatus", "machineStatus"];
|
||||
/**
|
||||
* Associative array of $components activities for this controller
|
||||
* Components are collections of activities (a little like traits) which
|
||||
* can be reused.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
public static $component_activities = [
|
||||
"accountaccess" =>
|
||||
["signin", "manageAccount", "manageUsers", "manageRoles"],
|
||||
"crawl" => ["manageCrawls", "pageOptions", "searchSources",
|
||||
"resultsEditor", "scrapers", "manageClassifiers"],
|
||||
"social" => ["manageGroups", "groupFeeds", "mixCrawls", "wiki"],
|
||||
"advertisement" => ["manageCredits", "manageAdvertisements"],
|
||||
"system" => ["manageMachines", "manageLocales", "serverSettings",
|
||||
"security", "appearance", "configure"]
|
||||
];
|
||||
/**
 * Main entry point for handling requests to administer the
 * Yioop/SeekQuarry site.
 *
 * When no profile exists the request is handed to
 * {@link configureRequest()}. Otherwise one of three situations is
 * handled: a signed-in user (the activity is run through
 * {@link processSession()}), a sign-in attempt (verified with
 * {@link checkSignin()}, possibly over several Fiat-Shamir rounds), or
 * a visitor who is shown the signin view. Finally the chosen view is
 * drawn.
 *
 * @return mixed whatever configureRequest() or redirectWithMessage()
 *      return when one of those paths is taken; nothing otherwise
 */
public function processRequest()
{
    if (!C\PROFILE) {
        return $this->configureRequest();
    }
    $view = "signin";
    // CSRF tokens are tied to the user id when signed in, else to the IP
    $user = isset($_SESSION['USER_ID']) ? $_SESSION['USER_ID'] :
        $_SERVER['REMOTE_ADDR'];
    $data = ['SCRIPT' => ""];
    $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
    $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user);
    if ($token_okay || isset($_REQUEST['u'])) {
        if (isset($_SESSION['USER_ID']) && !isset($_REQUEST['u'])) {
            // already signed in: carry out the requested activity
            $data = array_merge($data, $this->processSession());
            $view = isset($data['REFRESH']) ? $data['REFRESH'] : "admin";
        } else if (!(isset($_SESSION['REMOTE_ADDR']) ||
            isset($_REQUEST['u']))) {
            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                tl('admin_controller_need_cookies') . "</h1>');";
            unset($_SESSION['USER_ID']);
        } else if ($this->checkSignin()) {
            $restart_rounds = !isset($_SESSION['AUTH_COUNT']) ||
                (isset($_REQUEST['round_num']) &&
                $_REQUEST['round_num'] < $_SESSION['AUTH_COUNT']);
            if ($restart_rounds) {
                $_SESSION['AUTH_COUNT'] = 0;
            }
            if (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION) {
                $_SESSION['AUTH_COUNT']++;
                if ($_SESSION['AUTH_COUNT'] != C\FIAT_SHAMIR_ITERATIONS) {
                    /* more rounds needed: answer with the next challenge
                       bit, flagged with "done" before the last round */
                    $_SESSION['SALT_VALUE'] = rand(0, 1);
                    $is_penultimate_round = ($_SESSION['AUTH_COUNT'] ==
                        C\FIAT_SHAMIR_ITERATIONS - 1);
                    $salt_value = $is_penultimate_round ?
                        "done" . $_SESSION['SALT_VALUE'] :
                        $_SESSION['SALT_VALUE'];
                    e($salt_value);
                    exit();
                }
            } else {
                /*
                    if not doing Fiat Shamir pretend have gone through all
                    needed iterations
                 */
                $_SESSION['AUTH_COUNT'] = C\FIAT_SHAMIR_ITERATIONS;
            }
            $_SESSION['USER_NAME'] = $_REQUEST['u'];
            // successful login.
            if ($_SESSION['AUTH_COUNT'] == C\FIAT_SHAMIR_ITERATIONS) {
                $_SESSION['AUTH_COUNT'] = 0;
                $user_id = $this->model("signin")->getUserId(
                    $this->clean($_REQUEST['u'], "string"));
                $session = $this->model("user")->getUserSession($user_id);
                if (isset($_SESSION['LAST_ACTIVITY']) &&
                    is_array($_SESSION['LAST_ACTIVITY'])) {
                    $_REQUEST = array_merge($_REQUEST,
                        $_SESSION['LAST_ACTIVITY']);
                }
                if (is_array($session)) {
                    $_SESSION = $session;
                }
                $allowed_activities =
                    $this->model("user")->getUserActivities($user_id);
                if (!$allowed_activities) {
                    unset($_SESSION['USER_ID']);
                    unset($_REQUEST);
                    $_REQUEST['c'] = "admin";
                    return $this->redirectWithMessage(
                        tl('admin_controller_account_not_active'));
                }
                // now don't want to use remote address anymore
                $_SESSION['USER_ID'] = $user_id;
                $_REQUEST[C\CSRF_TOKEN] = $this->generateCSRFToken(
                    $_SESSION['USER_ID']);
                $keep_request_fields = !empty($_REQUEST['preserve']) &&
                    $_REQUEST['preserve'] == 'true';
                $preserve_array = $keep_request_fields ? [
                    'a','arg', 'filter', 'group_id',
                    'just_group_id', 'visible_users', 'user_filter'
                ] : [];
                return $this->redirectWithMessage(
                    tl('admin_controller_login_successful'),
                    $preserve_array);
            }
        } else {
            // sign-in check failed
            $_SESSION['AUTH_COUNT'] = 0;
            $first_zkp_failure =
                (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION
                && !isset($_SESSION['AUTH_FAILED']));
            if ($first_zkp_failure && isset($_REQUEST['round_num'])) {
                $_SESSION['SALT_VALUE'] = 1;
                $_SESSION['AUTH_FAILED'] = -1;
                e($_SESSION['AUTH_FAILED']);
                exit();
            }
            unset($_SESSION['USER_ID']);
            unset($_SESSION['AUTH_FAILED']);
            $login_attempted = isset($_REQUEST['u']);
            unset($_REQUEST);
            $_REQUEST['c'] = "admin";
            if ($first_zkp_failure) {
                return $this->redirectWithMessage(
                    tl('admin_controller_no_back_button'));
            }
            if ($login_attempted) {
                return $this->redirectWithMessage(
                    tl('admin_controller_login_failed'));
            }
        }
    } else if ($this->checkCSRFToken(C\CSRF_TOKEN, "config")) {
        $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
            tl('admin_controller_login_to_config') . "</h1>')";
    } else if (isset($_REQUEST['a']) &&
        in_array($_REQUEST['a'], $this->status_activities)) {
        e("<p class='red'>" .
            tl('admin_controller_status_updates_stopped') . "</p>");
        exit();
    }
    $data["ADMIN"] = ($token_okay && isset($_SESSION["USER_ID"]));
    if ($view == 'signin') {
        if (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION) {
            $data['AUTH_ITERATION'] = C\FIAT_SHAMIR_ITERATIONS;
            $data['FIAT_SHAMIR_MODULUS'] = C\FIAT_SHAMIR_MODULUS;
            $_SESSION['SALT_VALUE'] = rand(0, 1);
            $data['INCLUDE_SCRIPTS'] = ["zkp", "big_int", "sha1"];
        } else {
            unset($_SESSION['SALT_VALUE']);
        }
        // the signin form is always tied to the visitor's address
        $data[C\CSRF_TOKEN] = $this->generateCSRFToken(
            $_SERVER['REMOTE_ADDR']);
        $data['SCRIPT'] .= "var u; if ((u = elt('username')) && u.focus) ".
            "u.focus();";
    }
    $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
    if (!isset($data["USERNAME"]) && isset($_SESSION['USER_ID'])) {
        $data['USERNAME'] = $this->model("signin")->getUserName(
            $_SESSION['USER_ID']);
    }
    $this->initializeAdFields($data, false);
    $this->displayView($view, $data);
}
|
||||
/**
 * Called when there is no profile/work directory set up. Bypasses any
 * login: the current activity is processed with
 * {@link processSession()}, a CSRF token tied to the string "config" is
 * added, and the admin view is drawn.
 *
 * NOTE(review): the earlier comment said the configure screen is only
 * shown to users connecting from localhost; no such check is visible in
 * this method, so it is presumably enforced elsewhere -- confirm.
 */
public function configureRequest()
{
    $view_data = $this->processSession();
    $view_data[C\CSRF_TOKEN] = $this->generateCSRFToken("config");
    $this->displayView("admin", $view_data);
}
|
||||
/**
 * Checks whether the credentials sent (presumably by the signin form)
 * match a user known to the signin model.
 *
 * In normal authentication mode the fields u (user name) and p
 * (password) are checked. Otherwise the zero-knowledge fields u, x, y
 * are checked against the challenge stored in the session, and a
 * failure resets the session's round counter.
 *
 * @return bool whether the sign-in data was accepted
 */
public function checkSignin()
{
    if (C\AUTHENTICATION_MODE == C\NORMAL_AUTHENTICATION) {
        if (!isset($_REQUEST['u']) || !isset($_REQUEST['p'])) {
            return false;
        }
        return $this->model("signin")->checkValidSignin(
            $this->clean($_REQUEST['u'], "string"),
            $this->clean($_REQUEST['p'], "string"));
    }
    // a round can only be checked if nothing is missing and no earlier
    // round of this attempt has already failed
    $round_checkable = isset($_REQUEST['u'], $_REQUEST['x'],
        $_REQUEST['y'], $_SESSION['SALT_VALUE']) &&
        !isset($_SESSION['AUTH_FAILED']);
    $result = false;
    if ($round_checkable) {
        $result = $this->model("signin")->checkValidSigninForZKP(
            $this->clean($_REQUEST['u'], "string"),
            $this->clean($_REQUEST['x'], "string"),
            $this->clean($_REQUEST['y'], "string"),
            $_SESSION['SALT_VALUE'], C\FIAT_SHAMIR_MODULUS);
    }
    if (!$result) {
        $_SESSION['AUTH_COUNT'] = 0;
    }
    return $result;
}
|
||||
/**
 * Determines the user's current allowed activities and current activity,
 * then calls the method for the latter.
 *
 * This is called from {@link processRequest()} once a user is logged in
 * and from {@link configureRequest()} when no profile exists.
 *
 * @return array $data the results of doing the activity for display in the
 *      view. Contains only INACTIVE => true when the user has no allowed
 *      activities.
 */
public function processSession()
{
    $data = [];
    $allowed = false;
    if (!C\PROFILE || (C\nsdefined("FIX_NAME_SERVER") &&
        C\FIX_NAME_SERVER)) {
        $activity = "configure";
    } else if (isset($_REQUEST['a']) &&
        in_array($_REQUEST['a'], $this->activities, true)) {
        $activity = $_REQUEST['a'];
    } else {
        $activity = "manageAccount";
    }
    $activity_model = $this->model("activity");
    if (!C\PROFILE) {
        // without a profile only the configure activity is offered
        $allowed_activities = [ [
            "ACTIVITY_NAME" =>
            $activity_model->getActivityNameFromMethodName($activity),
            'METHOD_NAME' => $activity]];
        $allowed = true;
    } else {
        $allowed_activities =
            $this->model("user")->getUserActivities($_SESSION['USER_ID']);
    }
    if ($allowed_activities == []) {
        $data['INACTIVE'] = true;
        return $data;
    }
    /* activities that are permitted whenever the user may carry out the
       activity on the right-hand side */
    $implied_by = [
        "crawlStatus" => "manageCrawls",
        "machineStatus" => "manageMachines",
        "wiki" => "groupFeeds"
    ];
    foreach ($allowed_activities as $allowed_activity) {
        $method_name = $allowed_activity['METHOD_NAME'];
        if ($activity == $method_name ||
            (isset($implied_by[$activity]) &&
            $implied_by[$activity] == $method_name)) {
            $allowed = true;
            break;
        }
    }
    /* manageAccount is only the default choice; if the user's roles do not
       allow it (the business role only allows managing advertisements)
       fall back to the first activity the user may do */
    if (!$allowed && $activity == "manageAccount") {
        $activity = $allowed_activities[0]['METHOD_NAME'];
        $_REQUEST["a"] = $activity;
        $allowed = true;
    }
    if ($allowed) {
        $data = $this->call($activity);
        // normalize before writing keys, so a non-array result from the
        // activity can neither break the writes nor discard them
        if (!is_array($data)) {
            $data = [];
        }
        $data['ACTIVITY_METHOD'] = $activity; //for settings controller
        $data['ACTIVITIES'] = $allowed_activities;
    }
    if (!in_array($activity, $this->status_activities)) {
        // wiki pages are listed under the groupFeeds activity
        $name_activity = ($activity == "wiki") ? "groupFeeds" : $activity;
        $data['CURRENT_ACTIVITY'] =
            $activity_model->getActivityNameFromMethodName($name_activity);
        if (!empty($_REQUEST['TOGGLE_ACTIVITIES'])) {
            $_SESSION['HIDE_ACTIVITIES'] =
                empty($_SESSION['HIDE_ACTIVITIES']);
            $this->model("user")->setUserSession($_SESSION['USER_ID'],
                $_SESSION);
        }
        $data['HIDE_ACTIVITIES'] = !empty($_SESSION['HIDE_ACTIVITIES']);
    }
    $data['COMPONENT_ACTIVITIES'] = [];
    $component_translations = [
        "accountaccess" => tl('admin_controller_account_access'),
        "social" => tl('admin_controller_social'),
        "crawl" => tl('admin_controller_crawl_settings'),
        "system" => tl('admin_controller_system_settings'),
        "advertisement" => tl('admin_controller_advertisement')
    ];
    if (isset($data["ACTIVITIES"])) {
        // group the allowed activities under their translated component
        foreach (self::$component_activities as $component => $activities) {
            foreach ($data["ACTIVITIES"] as $listed_activity) {
                if (in_array($listed_activity['METHOD_NAME'], $activities)) {
                    $data['COMPONENT_ACTIVITIES'][
                        $component_translations[$component]][] =
                        $listed_activity;
                }
            }
        }
    }
    return $data;
}
|
||||
/**
 * Used to handle crawlStatus REST activities requesting the status of the
 * current web crawl
 *
 * @return array $data contains crawl status of current crawl as well as
 *      info about prior crawls and which crawl is being used for default
 *      search results. REFRESH is set to "crawlstatus"; processRequest()
 *      uses that value as the name of the view to draw.
 */
public function crawlStatus()
{
    $data = [];
    $data['REFRESH'] = "crawlstatus";
    $crawl_model = $this->model("crawl");
    $crawl_time = $crawl_model->getCurrentIndexDatabaseName();
    $data['CURRENT_INDEX'] = isset($crawl_time) ? (int)$crawl_time : -1;
    $machine_urls = $this->model("machine")->getQueueServerUrls();
    list($stalled, $status, $data['RECENT_CRAWLS']) =
        $crawl_model->combinedCrawlInfo($machine_urls);
    if ($stalled) {
        $crawl_model->sendStopCrawlMessage($machine_urls);
    }
    $data = array_merge($data, $status);
    $data["CRAWL_RUNNING"] = false;
    if (!empty($data['CRAWL_TIME'])) {
        //erase from previous crawl list any active crawl
        foreach ($data['RECENT_CRAWLS'] as $i => $recent_crawl) {
            if ($recent_crawl['CRAWL_TIME'] == $data['CRAWL_TIME']) {
                $data['RECENT_CRAWLS'][$i] = false;
            }
        }
        $data["CRAWL_RUNNING"] = true;
        /* array_filter keeps the original keys; reindex so the list is
           sequential again, otherwise the [0] check below fails whenever
           the active crawl was the first entry and sorting is skipped */
        $data['RECENT_CRAWLS'] = array_values(
            array_filter($data['RECENT_CRAWLS']));
    }
    if (isset($data['RECENT_CRAWLS'][0])) {
        // presumably this first call tells the callback which field to
        // order by before usort uses it -- confirm in the library
        L\rorderCallback($data['RECENT_CRAWLS'][0],
            $data['RECENT_CRAWLS'][0], 'CRAWL_TIME');
        usort($data['RECENT_CRAWLS'], C\NS_LIB . "rorderCallback");
    }
    $this->pagingLogic($data, 'RECENT_CRAWLS', 'RECENT_CRAWLS',
        C\DEFAULT_ADMIN_PAGING_NUM);
    return $data;
}
|
||||
/**
 * Gets data from the machine model concerning the machines managed by
 * this Yioop instance and passes this data to the machinestatus view.
 * When media jobs run on the name server and its MediaUpdater is turned
 * on but reported as 0, an attempt is made to start it again.
 *
 * @return array $data MACHINES field has the paged information about
 *      the managed machines, MEDIA_MODE the profile's media mode
 *      (defaulting to "name_server"). REFRESH is set to "machinestatus";
 *      processRequest() uses that value as the name of the view to draw.
 */
public function machineStatus()
{
    $data = ['REFRESH' => "machinestatus"];
    $this->pagingLogic($data, $this->model("machine"), 'MACHINES',
        C\DEFAULT_ADMIN_PAGING_NUM);
    $profile = $this->model("profile")->getProfile(C\WORK_DIRECTORY);
    if (isset($profile['MEDIA_MODE'])) {
        $data['MEDIA_MODE'] = $profile['MEDIA_MODE'];
    } else {
        $data['MEDIA_MODE'] = "name_server";
    }
    if ($data['MEDIA_MODE'] != "name_server") {
        return $data;
    }
    if ($data['MACHINES']['NAME_SERVER']["MEDIA_UPDATER_TURNED_ON"] &&
        $data['MACHINES']['NAME_SERVER']["MediaUpdater"] == 0) {
        // try to restart news server if dead
        CrawlDaemon::start("MediaUpdater", 'none', "", -1);
    }
    return $data;
}
|
||||
/**
 * Used to update the yioop installation profile based on $_REQUEST data
 *
 * For every field name listed by the profile model, a value present in
 * $_REQUEST is cleaned and copied into both $data and $profile. Fields
 * that end up without a value in $data receive a default.
 *
 * @param array& $data field data to be sent to the view
 * @param array& $profile used to contain the current and updated profile
 *      field values
 * @param array $check_box_fields fields whose data comes from a html
 *      checkbox; when such a field is absent from the request its profile
 *      value is set to false
 */
public function updateProfileFields(&$data, &$profile,
    $check_box_fields = [])
{
    /* Parentheses in submitted ad scripts are stored as HTML entities.
       The replacement text previously equalled the search text, which
       made these substitutions do nothing. */
    $script_fields = ['SIDE_ADSCRIPT', 'TOP_ADSCRIPT', 'GLOBAL_ADSCRIPT'];
    foreach ($script_fields as $script_field) {
        if (isset($_REQUEST[$script_field])) {
            $_REQUEST[$script_field] = str_replace(["(", ")"],
                ["&#40;", "&#41;"], $_REQUEST[$script_field]);
        }
    }
    $color_fields = ['BACKGROUND_COLOR', 'FOREGROUND_COLOR',
        'SIDEBAR_COLOR', 'TOPBAR_COLOR'];
    // fields whose request value is used as sent
    $unfiltered_fields = ["ROBOT_DESCRIPTION", "MEMCACHE_SERVERS",
        "PROXY_SERVERS"];
    // whitespace separated server lists kept in the profile as one string
    $server_list_fields = ["MEMCACHE_SERVERS", "PROXY_SERVERS"];
    foreach ($this->model("profile")->profile_fields as $field) {
        if (isset($_REQUEST[$field])) {
            if (in_array($field, $unfiltered_fields, true)) {
                $clean_value = $_REQUEST[$field];
            } else if (in_array($field, $color_fields, true)) {
                $clean_value = $this->clean($_REQUEST[$field], "color");
            } else {
                $clean_value = $this->clean($_REQUEST[$field], "string");
            }
            // substr() is safe for an empty value, unlike indexing the
            // string at strlen() - 1
            if ($field == "NAME_SERVER" &&
                substr($clean_value, -1) != "/") {
                $clean_value .= "/";
            }
            $data[$field] = $clean_value;
            $profile[$field] = $clean_value;
            if (in_array($field, $server_list_fields, true)) {
                $server_lines = preg_split("/(\s)+/", $clean_value);
                $profile[$field] =
                    $this->convertArrayLines($server_lines, "|Z|", true);
            }
        }
        if (!isset($data[$field])) {
            $is_check_box = in_array($field, $check_box_fields);
            if (defined($field) && !$is_check_box) {
                $data[$field] = constant($field);
            } else {
                $data[$field] = "";
            }
            if ($is_check_box) {
                // an unchecked checkbox is not sent by the browser
                $profile[$field] = false;
            }
        }
    }
}
|
||||
/**
 * Sets up the view data for a table search form and builds the search
 * criteria from $_REQUEST (so a submitted form keeps its drop down
 * choices). Table search forms are used by manageUsers, manageRoles,
 * manageGroups, to do advanced search of the entity they are responsible
 * for. Everything computed is also stored in
 * $_SESSION['SEARCH'][$activity . $field_postfix].
 *
 * @param array& $data modified to contain the field data needed for
 *      the view to draw the search form
 * @param string $activity activity in which this search is being conducted
 * @param array $comparison_fields those fields of the entity
 *      in question (for example, users) which can be searched
 * @param array $equal_comparison_fields those fields of the entity in
 *      question which can only be searched by equality/inequality
 *      operators
 * @param string $field_postfix suffix to append onto field names in
 *      case there are multiple forms on the same page
 * @return array one [field, comparison, value, sort direction] entry
 *      per comparison field
 */
public function tableSearchRequestHandler(&$data, $activity,
    $comparison_fields = [], $equal_comparison_fields = [],
    $field_postfix = "")
{
    $activity_postfix = $activity . $field_postfix;
    $equal_label = tl('admin_controller_equal');
    $not_equal_label = tl('admin_controller_not_equal');
    $data['FORM_TYPE'] = "search";
    $data['COMPARISON_TYPES'] = [
        "=" => $equal_label,
        "!=" => $not_equal_label,
        "CONTAINS" => tl('admin_controller_contains'),
        "BEGINS WITH" => tl('admin_controller_begins_with'),
        "ENDS WITH" => tl('admin_controller_ends_with'),
    ];
    $_SESSION['SEARCH'][$activity_postfix]['COMPARISON_TYPES'] =
        $data['COMPARISON_TYPES'];
    $data['EQUAL_COMPARISON_TYPES'] = [
        "=" => $equal_label,
        "!=" => $not_equal_label,
    ];
    $_SESSION['SEARCH'][$activity_postfix]['EQUAL_COMPARISON_TYPES'] =
        $data['EQUAL_COMPARISON_TYPES'];
    $data['SORT_TYPES'] = [
        "NONE" => tl('admin_controller_no_sort'),
        "ASC" => tl('admin_controller_sort_ascending'),
        "DESC" => tl('admin_controller_sort_descending'),
    ];
    $_SESSION['SEARCH'][$activity_postfix]['SORT_TYPES'] =
        $data['SORT_TYPES'];
    $paging = "";
    // comparison operator chosen for each field; "=" unless a legal
    // operator was requested
    foreach ($comparison_fields as $field) {
        $comparison_key = $field . "_comparison";
        $types_key = in_array($field, $equal_comparison_fields) ?
            'EQUAL_COMPARISON_TYPES' : 'COMPARISON_TYPES';
        $chosen_comparison = "=";
        if (isset($_REQUEST[$comparison_key]) &&
            isset($data[$types_key][$_REQUEST[$comparison_key]])) {
            $chosen_comparison = $_REQUEST[$comparison_key];
        }
        $data[$comparison_key] = $chosen_comparison;
        $_SESSION['SEARCH'][$activity_postfix]['COMPARISON_FIELDS'
            ][$comparison_key] = $chosen_comparison;
        $paging .= "&" . $comparison_key . "=" .
            urlencode($chosen_comparison);
    }
    // sort direction chosen for each field; "NONE" unless a legal
    // direction was requested
    foreach ($comparison_fields as $field) {
        $sort_key = $field . "_sort";
        $chosen_sort = "NONE";
        if (isset($_REQUEST[$sort_key]) &&
            isset($data['SORT_TYPES'][$_REQUEST[$sort_key]])) {
            $chosen_sort = $_REQUEST[$sort_key];
        }
        $data[$sort_key] = $chosen_sort;
        $_SESSION['SEARCH'][$activity_postfix]['SORT'][$sort_key] =
            $chosen_sort;
        $paging .= "&" . $sort_key . "=" . urlencode($chosen_sort);
    }
    // value searched for in each field together with its criterion
    $search_array = [];
    foreach ($comparison_fields as $field) {
        $field_name = $field . $field_postfix;
        if (isset($_REQUEST[$field_name])) {
            $data[$field_name] =
                $this->clean($_REQUEST[$field_name], "string");
        } else {
            $data[$field_name] = "";
        }
        $_SESSION['SEARCH'][$activity_postfix]['FIELD_NAMES'
            ][$field_name] = $data[$field_name];
        // access values of 10 or more are searched for as value/10 in
        // the status field
        $searches_status = ($field_name == 'access' &&
            $data[$field_name] >= 10);
        $search_field = $searches_status ? "status" : $field;
        $search_value = $searches_status ? $data[$field_name]/10 :
            $data[$field_name];
        $search_array[] = [$search_field, $data[$field . "_comparison"],
            $search_value, $data[$field . "_sort"]];
        $paging .= "&" . $field_name . "=" .
            urlencode($data[$field_name]);
    }
    $data['PAGING'] = $paging;
    $_SESSION['SEARCH'][$activity_postfix]['SEARCH_ARRAY'] =
        $search_array;
    $_SESSION['SEARCH'][$activity_postfix]['PAGING'] =
        $data['PAGING'];
    return $search_array;
}
|
||||
/**
 * For activity involving items for which one can do search (user, group,
 * roles) this method is used to marshal the last search that was performed
 * out of the session when one navigates back to search
 *
 * The search is read from
 * $_SESSION['LAST_SEARCH'][$activity . $field_postfix]. Parts of the
 * stored search that are missing are skipped rather than iterated.
 *
 * @param array &$data field variables used by view to draw itself
 * @param string $activity current activity marshalling last search for
 * @param string $field_postfix some activities support multiple search
 *      forms. The field postfix is used to select among these.
 * @return array|null the restored search criteria, or null (with $data
 *      left untouched) when no usable last search is stored
 */
function restoreLastSearchFromSession(&$data, $activity,
    $field_postfix = "")
{
    $activity_postfix = $activity . $field_postfix;
    if (empty($_SESSION['LAST_SEARCH'][$activity_postfix]) ||
        !is_array($_SESSION['LAST_SEARCH'][$activity_postfix])) {
        return;
    }
    $last_search = $_SESSION['LAST_SEARCH'][$activity_postfix];
    // values copied under their own name, empty array when absent
    foreach (['COMPARISON_TYPES', 'EQUAL_COMPARISON_TYPES',
        'SORT_TYPES', 'SEARCH_ARRAY', 'PAGING'] as $field) {
        $data[$field] = (empty($last_search[$field])) ? [] :
            $last_search[$field];
    }
    // groups of name => value pairs that are flattened into $data
    foreach (['COMPARISON_FIELDS', 'SORT', 'FIELD_NAMES'] as $group) {
        if (empty($last_search[$group]) ||
            !is_array($last_search[$group])) {
            continue;
        }
        foreach ($last_search[$group] as $name => $value) {
            $data[$name] = $value;
        }
    }
    return $data['SEARCH_ARRAY'];
}
|
||||
}
|
115
src/controllers/ApiController.php
Normal file
115
src/controllers/ApiController.php
Normal file
|
@ -0,0 +1,115 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Eswara Rajesh Pinapala epinapala@live.com
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\WikiPaser;
|
||||
|
||||
/**
|
||||
* Controller used to handle user group activities outside of
|
||||
* the admin panel setting. This either could be because the admin panel
|
||||
* is "collapsed" or because the request concerns a wiki page.
|
||||
*
|
||||
* @author Eswara Rajesh Pinapala
|
||||
*/
|
||||
class ApiController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Associative array of $components activities for this controller
|
||||
* Components are collections of activities (a little like traits) which
|
||||
* can be reused.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
public static $component_activities = [ "social" => ["wiki"] ];
|
||||
/**
 * Processes a request made to the api controller: the requested
 * activity is carried out by {@link processSession()} and its data is
 * drawn with the view named in the VIEW field of that data, or with the
 * api view if no such field is present. Without a profile the request
 * is handed to configureRequest() instead.
 *
 * @return mixed result of configureRequest() when no profile exists;
 *      nothing otherwise
 */
public function processRequest()
{
    if (!C\PROFILE) {
        return $this->configureRequest();
    }
    $signed_in = isset($_SESSION['USER_ID']);
    $data = $signed_in ? ['ADMIN' => 1] : [];
    $user_id = $signed_in ? $_SESSION['USER_ID'] :
        $_SERVER['REMOTE_ADDR'];
    $data['SCRIPT'] = "";
    // NOTE(review): the outcome of this check is not used below --
    // confirm whether requests with a bad token should be restricted
    $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
    $data = array_merge($data, $this->processSession());
    $view = 'api';
    if (isset($data["VIEW"])) {
        $view = $data["VIEW"];
    }
    $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
    $this->displayView($view, $data);
}
|
||||
/**
 * Used to perform the actual activity call to be done by the
 * api_controller.
 *
 * processSession is called from {@link processRequest()}. It makes sure
 * the $_REQUEST'd activity is one of this controller's activities (or
 * falls back to groupFeeds) and then calls it. The resulting data is
 * tagged with the activity's controller and method (used by the settings
 * controller) and with the cleaned page_name request field as
 * PAGE_TITLE.
 *
 * @return array $data the results of doing the activity for display in
 *      the view
 */
public function processSession()
{
    if (isset($_REQUEST['a']) &&
        in_array($_REQUEST['a'], $this->activities, true)) {
        $activity = $this->clean($_REQUEST['a'], "string");
    } else {
        $activity = "groupFeeds";
    }
    $data = $this->call($activity);
    // normalize before writing keys, so a non-array result from the
    // activity can neither break the writes nor discard them
    if (!is_array($data)) {
        $data = [];
    }
    // page_name is optional; treat a missing one as the empty string
    $page_name = isset($_REQUEST['page_name']) ?
        $_REQUEST['page_name'] : "";
    $data['ACTIVITY_CONTROLLER'] = "group";
    $data['PAGE_TITLE'] = $this->clean($page_name, "string");
    $data['ACTIVITY_METHOD'] = $activity; //for settings controller
    return $data;
}
|
||||
}
|
||||
|
92
src/controllers/ArchiveController.php
Normal file
92
src/controllers/ArchiveController.php
Normal file
|
@ -0,0 +1,92 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\WebArchiveBundle;
|
||||
|
||||
/**
 * Fetcher machines also act as archives for complete caches of web pages;
 * this controller is used to handle access to these web page caches.
 *
 * @author Chris Pollett
 */
class ArchiveController extends Controller implements CrawlConstants
{
    /**
     * The only legal activity this controller will accept is a request
     * for the cache of a web page
     * @var array
     */
    public $activities = ["cache"];

    /**
     * Main method for this controller to handle requests. It first checks
     * the request is valid, and then handles the corresponding activity.
     *
     * For this controller the only activity is to handle a cache request.
     * Requests that name no activity, or an activity not listed in
     * $activities, are ignored.
     */
    public function processRequest()
    {
        /* do a quick test to see if this request seems to come from a
           legitimate machine
         */
        if (!$this->checkRequest()) {
            return;
        }
        if (!isset($_REQUEST['a'])) {
            return;
        }
        $activity = $this->clean($_REQUEST['a'], "string");
        /* $activity comes straight from the request, so only dispatch to
           methods this controller explicitly lists as activities; otherwise
           any public method could be invoked by name
         */
        if (!in_array($activity, $this->activities, true)) {
            return;
        }
        $this->call($activity);
    }

    /**
     * Retrieves the requested page from the WebArchiveBundle and echoes it
     * serialized and base64 encoded. If the archive folder for the
     * requested crawl does not exist, an encoded serialized false is echoed
     * instead.
     *
     * Reads offset, partition, crawl_time, and (optionally) instance_num
     * from $_REQUEST.
     */
    public function cache()
    {
        $offset = $this->clean($_REQUEST['offset'], "int");
        $partition = $this->clean($_REQUEST['partition'], "int");
        /* NOTE(review): crawl_time is cleaned as a string and then used to
           build a filesystem path -- confirm clean() prevents path
           traversal, or clean it as an int if it is always a timestamp
         */
        $crawl_time = $this->clean($_REQUEST['crawl_time'], "string");
        $prefix = "";
        if (isset($_REQUEST['instance_num'])) {
            $prefix = $this->clean($_REQUEST['instance_num'], "int") . "-";
        }
        $archive_path = C\CRAWL_DIR . '/cache/' . $prefix .
            self::archive_base_name . $crawl_time;
        if (file_exists($archive_path)) {
            $web_archive = new WebArchiveBundle($archive_path);
            $page = $web_archive->getPage($offset, $partition);
            echo base64_encode(serialize($page));
        } else {
            echo base64_encode(serialize(false));
        }
    }
}
|
315
src/controllers/ClassifierController.php
Normal file
315
src/controllers/ClassifierController.php
Normal file
|
@ -0,0 +1,315 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
use seekquarry\yioop\library\classifiers\Classifier;
|
||||
use seekquarry\yioop\library\archive_bundle_iterators\MixArchiveBundleIterator;
|
||||
|
||||
/**
 * This class handles XmlHttpRequests to label documents during classifier
 * construction.
 *
 * Searching for new documents to label and add to the training set is a
 * heavily-interactive operation, so it is implemented using asynchronous
 * requests to this controller in order to fetch candidates for labeling and
 * add labels without reloading the classifier edit page. The admin controller
 * takes care of first displaying the "edit classifier" page, and handles
 * requests to change a classifier's class label, but this controller handles
 * the other asynchronous requests issued by the JavaScript on the page.
 *
 * @author Shawn Tice
 */
class ClassifierController extends Controller implements CrawlConstants
{
    /**
     * These are the activities supported by this controller
     * @var array
     */
    public $activities = ["classify"];

    /**
     * Checks that the request seems to be coming from a legitimate, logged-in
     * user, then dispatches to the appropriate activity. Requests without an
     * activity, or naming an activity not in $activities, are ignored.
     */
    public function processRequest()
    {
        if (!isset($_REQUEST['a']) || !$this->checkRequest()) {
            return;
        }
        $activity = $_REQUEST['a'];
        if (in_array($activity, $this->activities, true)) {
            $this->call($activity);
        }
    }

    /**
     * Finds the next document for which to request a label, sometimes first
     * recording the label that the user selected for the last document. This
     * method should only be called via an XmlHttpRequest initiated by the edit
     * classifier JavaScript, and consequently it always writes out
     * JSON-encoded data, which is easily decoded by the page JavaScript.
     *
     * Reads arg, label, and optionally index, type, keywords and
     * doc_to_label from $_REQUEST.
     */
    public function classify()
    {
        $arg = $this->clean($_REQUEST['arg'], 'string');
        $label = $this->clean($_REQUEST['label'], 'string');
        /* These are only supplied together with 'index'; give them defaults
           so the getdocs/addlabel branches below never read undefined
           variables when the request omits it.
         */
        $index = null;
        $source_type = "";
        $keywords = "";
        if (isset($_REQUEST['index'])) {
            $index = $this->clean($_REQUEST['index'], 'int');
            // an index of 1 is shorthand for the current default index
            if (intval($index) == 1) {
                $index = $this->model("crawl")->getCurrentIndexDatabaseName();
            }
            $source_type = $this->clean($_REQUEST['type'], 'string');
            $keywords = $this->clean($_REQUEST['keywords'], 'string');
        }
        /*
            The call to prepareToLabel is important; it loads all of the data
            required to manage the training set from disk, and also determines
            what will be saved *back* to disk later.
         */
        $classifier = Classifier::getClassifier($label);
        $classifier->prepareToLabel();
        $data = [];
        switch ($arg) {
            case 'getdocs':
                /*
                    Load documents in from a user-specified index, and find the
                    next best one to label (for 'manual' source type), or label
                    them all with a single label (for either the 'positive' or
                    'negative' source types).
                 */
                $mix_iterator = $this->buildClassifierCrawlMix(
                    $label, $index, $keywords);
                if ($source_type == 'manual') {
                    $num_docs = $classifier->initBuffer($mix_iterator);
                    $classifier->computeBufferDensities();
                    $data['num_docs'] = $num_docs;
                    list($new_doc, $disagreement) =
                        $classifier->findNextDocumentToLabel();
                    if ($new_doc) {
                        $score = $classifier->classify($new_doc);
                        $data['new_doc'] = $this->prepareUnlabelledDocument(
                            $new_doc, $score, $disagreement,
                            $index, $keywords);
                    }
                    Classifier::setClassifier($classifier);
                } else if ($source_type == 'positive' ||
                    $source_type == 'negative') {
                    $doc_label = ($source_type == 'positive') ? 1 : -1;
                    $add_count = $classifier->addAllDocuments(
                        $mix_iterator, $doc_label);
                    if ($add_count > 0) {
                        /*
                            Pass true to always update accuracy after adding a
                            batch of documents all at once.
                         */
                        $classifier->train(true);
                        Classifier::setClassifier($classifier);
                    }
                    $data['add_count'] = $add_count;
                }
                break;
            case 'addlabel':
                /*
                    First label the last candidate document presented to the
                    user (potentially skipping it instead of actually applying a
                    label), then pick the next best candidate for labeling.
                    When skipping a document instead of adding a label, avoid
                    re-training since the training set hasn't actually changed.
                 */
                $doc = $_REQUEST['doc_to_label'];
                $key = L\webdecode($this->clean($doc['key'], 'string'));
                $doc_label = $this->clean($doc['label'], 'int');
                $mix_iterator = $this->retrieveClassifierCrawlMix($label);
                $labels_changed = $classifier->labelDocument($key, $doc_label);
                $num_docs = $classifier->refreshBuffer($mix_iterator);
                $classifier->computeBufferDensities();
                $data['num_docs'] = $num_docs;
                if ($labels_changed) {
                    // only recompute accuracy on every tenth labelled doc
                    $update_accuracy = $classifier->total > 0 &&
                        $classifier->total % 10 == 0;
                    $classifier->train($update_accuracy);
                }
                list($new_doc, $disagreement) =
                    $classifier->findNextDocumentToLabel();
                if ($new_doc) {
                    $score = $classifier->classify($new_doc);
                    $data['new_doc'] = $this->prepareUnlabelledDocument(
                        $new_doc, $score, $disagreement,
                        $index, $keywords);
                }
                Classifier::setClassifier($classifier);
                break;
            case 'updateaccuracy':
                /*
                    Don't do anything other than re-compute the accuracy for the
                    current training set.
                 */
                $classifier->updateAccuracy();
                Classifier::setClassifier($classifier);
                break;
        }

        /*
            No matter which activity we ended up carrying out, always include
            the statistics that *might* have changed so that the client can just
            naively keep them up to date.
         */
        $data['positive'] = $classifier->positive;
        $data['negative'] = $classifier->negative;
        $data['total'] = $classifier->total;
        $data['accuracy'] = $classifier->accuracy;

        /*
            Pass along a new authentication token so that the client can make a
            new authenticated request after this one.
         */
        $data['authTime'] = strval(time());
        $data['authSession'] = md5($data['authTime'] . C\AUTH_KEY);

        $response = json_encode($data);
        header("Content-Type: application/json");
        header("Content-Length: " . strlen($response));
        echo $response;
    }

    /* HELPER METHODS */

    /**
     * Creates a new crawl mix for an existing index, with an optional query,
     * and returns an iterator for the mix. The crawl mix name is derived from
     * the class label, so that it can be easily retrieved and deleted later
     * on. Any existing crawl mix with that name is deleted first.
     *
     * @param string $label class label of the classifier the new crawl mix
     *     will be associated with
     * @param int $crawl_time timestamp of the index to be iterated over
     * @param string $keywords an optional query used to restrict the pages
     *     retrieved by the crawl mix
     * @return object A MixArchiveBundleIterator instance that will iterate
     *     over the pages of the requested index
     */
    public function buildClassifierCrawlMix($label, $crawl_time, $keywords)
    {
        $crawl_model = $this->model("crawl");
        $mix_time = time();
        $mix_name = Classifier::getCrawlMixName($label);

        // Replace any existing crawl mix.
        $old_time = $crawl_model->getCrawlMixTimestamp($mix_name);
        if ($old_time) {
            $crawl_model->deleteCrawlMixIteratorState($old_time);
            $crawl_model->deleteCrawlMix($old_time);
        }

        $crawl_model->setCrawlMix([
            'TIMESTAMP' => $mix_time,
            'NAME' => $mix_name,
            'OWNER_ID' => $_SESSION['USER_ID'],
            'PARENT' => -1,
            'FRAGMENTS' => [
                ['RESULT_BOUND' => 1,
                 'COMPONENTS' => [[
                     'CRAWL_TIMESTAMP' => $crawl_time,
                     'WEIGHT' => 1.0,
                     'KEYWORDS' => $keywords]]]]]);
        return new MixArchiveBundleIterator($mix_time, $mix_time);
    }

    /**
     * Retrieves an iterator for an existing crawl mix. The crawl mix remembers
     * its previous offset, so that the new iterator picks up where the
     * previous one left off.
     *
     * @param string $label class label of the classifier this crawl mix is
     *     associated with
     * @return object new MixArchiveBundleIterator instance that picks up where
     *     the previous one left off
     */
    public function retrieveClassifierCrawlMix($label)
    {
        $mix_name = Classifier::getCrawlMixName($label);
        $mix_time = $this->model("crawl")->getCrawlMixTimestamp($mix_name);
        return new MixArchiveBundleIterator($mix_time, $mix_time);
    }

    /**
     * Creates a fresh array from an existing page summary array, and augments
     * it with extra data relevant to the labeling interface on the client.
     *
     * @param array $page original page summary array
     * @param float $score classification score (estimated by the Naive Bayes
     *     text classification algorithm) for $page
     * @param float $disagreement disagreement score computed for $page
     * @param int $crawl_time index the page came from
     * @param string $keywords query supplied to the crawl mix used to find
     *     $page; disjuncts are separated by "|"
     * @return array reduced page summary structure containing only the
     *     information that the client needs to display a summary of the page
     */
    public function prepareUnlabelledDocument($page, $score, $disagreement,
        $crawl_time, $keywords)
    {
        $phrase_model = $this->model("phrase");
        // Highlight the query keywords, if any.
        $disjunct_phrases = explode("|", $keywords);
        $words = [];
        foreach ($disjunct_phrases as $disjunct_phrase) {
            list($word_struct, $format_words) =
                $phrase_model->parseWordStructConjunctiveQuery(
                    $disjunct_phrase);
            $words = array_merge($words, $format_words);
        }
        $title = $phrase_model->boldKeywords(
            $page[self::TITLE], $words);
        $description = $phrase_model->getSnippets(
            strip_tags($page[self::DESCRIPTION]), $words, 400);
        $description = $phrase_model->boldKeywords(
            $description, $words);
        $cache_link = "?c=search&a=cache" .
            "&q=" . urlencode($keywords) .
            "&arg=" . urlencode($page[self::URL]) .
            "&its=" . $crawl_time;
        /*
            Note that the confidence is a transformation of the score that
            converts it into a value between 0 and 1, where it's 0 if the score
            was exactly 0.5, and increases toward 1 as the score either
            increases toward 1 or decreases toward 0.
         */
        return [
            'title' => $title,
            'url' => $page[self::URL],
            'key' => L\webencode(Classifier::makeKey($page)),
            'cache_link' => $cache_link,
            'description' => $description,
            'score' => $score,
            'positive' => $score >= 0.5 ? 1 : 0,
            'confidence' => abs($score - 0.5) / 0.5,
            'disagreement' => $disagreement];
    }
}
|
964
src/controllers/Controller.php
Normal file
964
src/controllers/Controller.php
Normal file
|
@ -0,0 +1,964 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop as B;
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\AnalyticsManager;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
use seekquarry\yioop\models\Model;
|
||||
use seekquarry\yioop\controllers\components\Component;
|
||||
use seekquarry\yioop\views\View;
|
||||
|
||||
/**
|
||||
* Load crawlHash and timing functions
|
||||
*/
|
||||
require_once C\BASE_DIR."/library/Utility.php";
|
||||
/** For guessing locale and formatting date based on locale guessed*/
|
||||
require_once C\BASE_DIR."/library/LocaleFunctions.php";
|
||||
/**
 * Translates the supplied arguments into the current locale.
 *
 * This is a thin convenience wrapper, local to this subnamespace, that
 * forwards every argument it receives to the library's tl function
 * @see seekquarry\yioop\library\tl()
 *
 * @param string string_identifier identifier to be translated
 * @param mixed additional_args used for interpolation in translated string
 * @return string translated string
 */
function tl()
{
    $library_translator = C\NS_LIB . "tl";
    $translation_args = func_get_args();
    return call_user_func_array($library_translator, $translation_args);
}
|
||||
/**
 * Shorthand for writing a string to the current output
 *
 * @param string $text string to send to the current output
 */
function e($text)
{
    print $text;
}
|
||||
/**
|
||||
* Base controller class for all controllers on
|
||||
* the SeekQuarry site.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
abstract class Controller
|
||||
{
|
||||
/**
|
||||
* Array of instances of views used by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $view_instances = [];
|
||||
/**
|
||||
* Array of instances of models used by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $model_instances;
|
||||
/**
|
||||
* Array of instances of indexing_plugins used by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $plugin_instances;
|
||||
/**
|
||||
* Says which activities (roughly, methods invoked from the web) this
|
||||
* controller will respond to
|
||||
* @var array
|
||||
*/
|
||||
public $activities = [];
|
||||
/**
|
||||
* Associative array of activity => component activity is on, used
|
||||
* by @see Controller::call method to actually invoke a given activity
|
||||
* on a given component
|
||||
* @var array
|
||||
*/
|
||||
public $activity_component = [];
|
||||
/**
|
||||
* Associative array of $components activities for this controller
|
||||
* Components are collections of activities (a little like traits) which
|
||||
* can be reused.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
public static $component_activities = [];
|
||||
/**
 * Sets up component activities and the (initially empty) arrays used to
 * cache component, plugin, model, and view instances.
 *
 * If the request's activity is "signout", the user is instead signed out
 * and redirected with a logout message before any set up is done.
 */
public function __construct()
{
    // this is to force a signout request to signout in all controllers
    if (isset($_REQUEST['a']) && $_REQUEST['a'] == "signout") {
        unset($_SESSION['USER_ID']);
        unset($_REQUEST);
        return $this->redirectWithMessage(
            tl('search_controller_logout_successful'));
    }
    /* if no signout was made, get on with the business of building
       this controller
     */
    mb_internal_encoding("UTF-8");
    mb_regex_encoding("UTF-8");
    $class = get_class($this);
    // record which component each activity lives on, so call() can find it
    foreach ($class::$component_activities as $component => $activities) {
        foreach ($activities as $activity) {
            $this->activity_component[$activity] = $component;
            $this->activities[] = $activity;
        }
    }
    $this->component_instances = [];
    /* the declared property (and the one plugin() reads and writes) is
       plugin_instances
     */
    $this->plugin_instances = [];
    $this->model_instances = [];
    $this->view_instances = [];
}
|
||||
/**
 * Entry point each concrete controller must implement in order to
 * handle web requests
 */
abstract public function processRequest();
|
||||
/**
 * Dynamic loader for Component objects which might live on the current
 * Controller. An instance is created (passing it this controller) the
 * first time a component is asked for and is reused afterwards.
 *
 * @param string $component name of component to return
 * @return object the requested component instance
 */
public function component($component)
{
    if (isset($this->component_instances[$component])) {
        return $this->component_instances[$component];
    }
    $class_name = C\NS_COMPONENTS . ucfirst($component) . "Component";
    $instance = new $class_name($this);
    $this->component_instances[$component] = $instance;
    return $instance;
}
|
||||
/**
 * Dynamic loader for Model objects which might live on the current
 * Controller. An instance is created the first time a model is asked for
 * and is reused afterwards.
 *
 * @param string $model name of model to return
 * @return object the requested model instance
 */
public function model($model)
{
    if (isset($this->model_instances[$model])) {
        return $this->model_instances[$model];
    }
    $class_name = C\NS_MODELS . ucfirst($model) . "Model";
    $instance = new $class_name();
    $this->model_instances[$model] = $instance;
    return $instance;
}
|
||||
/**
 * Dynamic loader for Plugin objects which might live on the current
 * Controller. An instance is created the first time a plugin is asked for
 * and is reused afterwards.
 *
 * @param string $plugin name of Plugin to return
 * @return object the requested plugin instance
 */
public function plugin($plugin)
{
    if (isset($this->plugin_instances[$plugin])) {
        return $this->plugin_instances[$plugin];
    }
    $class_name = C\NS_PLUGINS . ucfirst($plugin) . "Plugin";
    $instance = new $class_name();
    $this->plugin_instances[$plugin] = $instance;
    return $instance;
}
|
||||
/**
 * Used to get a list of all available indexing plugins for this Yioop
 * instance. Both the base and the app library/indexing_plugins folders
 * are scanned for files named *Plugin.php; IndexingPlugin.php itself is
 * excluded from the result.
 *
 * @return array elts of which are names of indexing plugins (file names
 *     with the Plugin.php suffix removed); empty if none were found
 */
public function getIndexingPluginList()
{
    // start from an empty list so that something defined is always returned
    $indexing_plugins = [];
    $plugin_ext_len = strlen("Plugin.php");
    $dir_prefixes = [C\BASE_DIR, C\APP_DIR];
    foreach ($dir_prefixes as $dir_prefix) {
        $plugin_dir = $dir_prefix . "/library/indexing_plugins/";
        $plugin_dir_len = strlen($plugin_dir);
        $filenames = glob("$plugin_dir*Plugin.php");
        if (!$filenames) {
            // glob() gives false on error and may give [] on no match
            continue;
        }
        foreach ($filenames as $filename) {
            // strip the folder prefix and the Plugin.php suffix
            $tmp_plug_name = substr($filename, $plugin_dir_len,
                -$plugin_ext_len);
            if ($tmp_plug_name != "Indexing") {
                $indexing_plugins[] = $tmp_plug_name;
            }
        }
    }
    return $indexing_plugins;
}
|
||||
/**
 * Dynamic loader for View objects which might live on the current
 * Controller. An instance is created the first time a view is asked for
 * and is reused afterwards.
 *
 * @param string $view name of view to return
 * @return object the requested view instance
 */
public function view($view)
{
    if (isset($this->view_instances[$view])) {
        return $this->view_instances[$view];
    }
    $class_name = C\NS_VIEWS . ucfirst($view) . "View";
    $instance = new $class_name();
    $this->view_instances[$view] = $instance;
    return $instance;
}
|
||||
/**
 * Renders the named view with the supplied data, after augmenting the
 * data with locale information (tag, direction, block progression,
 * writing mode), the requested controller name, any pending session
 * display message, and -- when C\QUERY_STATISTICS is on -- query logs
 * and elapsed times gathered from the loaded models, the locale code,
 * and the mail analytics.
 *
 * @param string $view the name of the view to draw
 * @param array $data an array of values to use in drawing the view
 */
public function displayView($view, $data)
{
    $data['LOCALE_TAG'] = L\getLocaleTag();
    $data['LOCALE_DIR'] = L\getLocaleDirection();
    $data['BLOCK_PROGRESSION'] = L\getBlockProgression();
    $data['WRITING_MODE'] = L\getWritingMode();
    if (C\QUERY_STATISTICS) {
        $data['QUERY_STATISTICS'] = [];
        if (isset($_SERVER["HTTP_HOST"])) {
            $host = htmlentities($_SERVER["HTTP_HOST"]);
        } else {
            $host = "localhost";
        }
        if (isset($_SERVER['REQUEST_URI'])) {
            $uri = htmlentities($_SERVER['REQUEST_URI']);
        } else {
            $uri = "/";
        }
        $scheme = "http://";
        if (isset($_SERVER["HTTPS"])) {
            $scheme = "https://";
        }
        if ($host === '::1') { //IPv6 :(
            //used if the fetching and queue serving on the same machine
            $host = "[::1]/";
        }
        $data['YIOOP_INSTANCE'] = $scheme . $host . $uri;
        $data['TOTAL_ELAPSED_TIME'] = 0;
        /* accumulate locally, newest source first, then store the totals
           once everything has been gathered
         */
        $query_log = [];
        $elapsed = 0;
        foreach ($this->model_instances as $model) {
            $query_log = array_merge($model->db->query_log, $query_log);
            $elapsed += $model->db->total_time;
        }
        $locale_info = L\getLocaleQueryStatistics();
        $query_log = array_merge($locale_info['QUERY_LOG'], $query_log);
        $elapsed += $locale_info['TOTAL_ELAPSED_TIME'];
        $mail_total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
        $mail_messages = AnalyticsManager::get("MAIL_MESSAGES");
        if ($mail_total_time && $mail_messages) {
            $query_log = array_merge($mail_messages, $query_log);
            $elapsed += $mail_total_time;
        }
        $data['QUERY_STATISTICS'] = $query_log;
        $data['TOTAL_ELAPSED_TIME'] = $elapsed;
    }
    if (isset($_REQUEST['c'])) {
        $data['c'] = $_REQUEST['c'];
    } else {
        $data['c'] = null;
    }
    // a message left by redirectWithMessage is shown once, then forgotten
    if (isset($_SESSION['DISPLAY_MESSAGE'])) {
        $data['DISPLAY_MESSAGE'] = $_SESSION['DISPLAY_MESSAGE'];
        unset($_SESSION['DISPLAY_MESSAGE']);
    }
    $this->view($view)->render($data);
}
|
||||
/**
 * Redirects the client to a location built from selected $_REQUEST
 * fields, and sets a session variable so a message can be displayed
 * on arrival. For XMLHttpRequests the text "go" followed by the location
 * is output instead of a Location header. When run from the command line
 * no redirect is done and the message data is returned instead;
 * otherwise script execution ends after the redirect is sent.
 *
 * @param string $message message to write; if empty any pending session
 *     message is cleared
 * @param mixed $copy_fields array of additional $_REQUEST fields to copy
 *     into the redirect's query string (on top of the default ones), or
 *     false for just the defaults
 * @return array only in the command-line case: an array with a 'MESSAGE'
 *     entry if $message was non-empty, otherwise an empty array
 */
public function redirectWithMessage($message, $copy_fields = false)
{
    $default_fields = ["c", "a", C\CSRF_TOKEN, "just_thread",
        "just_group_id", "just_user_id", "group_id", "user_id", "role_id",
        "limit", "num"];
    if ($copy_fields) {
        $copy_fields = array_merge($default_fields, $copy_fields);
    } else {
        $copy_fields = $default_fields;
    }
    $query_array = [];
    foreach ($copy_fields as $field) {
        if (isset($_REQUEST[$field])) {
            if (is_array($_REQUEST[$field])) {
                $array_params_cleaned = $_REQUEST[$field];
                foreach ($array_params_cleaned as $key => $value) {
                    $query_array[$field][$this->clean($key, "string")] =
                        $this->clean($value, "string");
                }
            } else {
                $query_array[$field] = $this->clean($_REQUEST[$field],
                    "string");
            }
        }
    }
    // fields already carried by the route are not repeated in the query
    if (isset($_REQUEST['route'])) {
        foreach ($query_array as $field => $value) {
            if (!empty($_REQUEST['route'][$field])) {
                unset($query_array[$field]);
            }
        }
    }
    $query_array = array_filter($query_array);
    $location = ($query_array == []) ? C\BASE_URL :
        "?" . http_build_query($query_array);
    // defined up front so the command-line return below is never undefined
    $data = [];
    if ($message) {
        $data['MESSAGE'] = $message;
        $_SESSION['DISPLAY_MESSAGE'] = $message;
    } else {
        unset($_SESSION['DISPLAY_MESSAGE']);
    }
    if (php_sapi_name() == 'cli') {
        //this case happens for configure_tool.php
        return $data;
    }
    if (isset($_SERVER['HTTP_X_REQUESTED_WITH']) &&
        $_SERVER['HTTP_X_REQUESTED_WITH'] == "XMLHttpRequest") {
        e("go$location");
    } else {
        header("Location: $location");
    }
    exit();
}
|
||||
/**
 * When an activity involves displaying tabular data (such as rows of
 * users, groups, etc), this method might be called to set up $data
 * fields for next, prev, and page links, it also makes the call to the
 * model to get the row data sorted and restricted as desired. For some
 * data sources, rather than directly make a call to the model to get the
 * data it might be passed directly to this method.
 *
 * The paging request fields (num_show, start_row, end_row, each prefixed
 * by $var_prefix) are read from $_REQUEST and cleaned as ints before use.
 *
 * @param array& $data used to send data to the view will be updated by
 *     this method with row and paging data
 * @param mixed $field_or_model if an object, this is assumed to be a model
 *     and so the getRows method of this model is called to get row data,
 *     sorted and restricted according to $search_array; if a string
 *     then the row data is assumed to be in $data[$field_or_model] and
 *     pagingLogic itself does the sorting and restricting.
 * @param string $output_field output rows for the view will be stored in
 *     $data[$output_field]
 * @param int $default_show if not specified by $_REQUEST, then this will
 *     be used to determine the maximum number of rows that will be
 *     written to $data[$output_field]
 * @param array $search_array used to sort and restrict in
 *     the getRows call or the data from $data[$field_or_model].
 *     Each element of this is a quadruple name of a field, what comparison
 *     to perform, a value to check, and an order (ascending/descending)
 *     to sort by
 * @param string $var_prefix if there are multiple uses of pagingLogic
 *     presented on the same view then $var_prefix can be prepended to
 *     to the $data field variables like num_show, start_row, end_row
 *     to distinguish between them
 * @param array $args additional arguments that are passed to getRows and
 *     in turn to selectCallback, fromCallback, and whereCallback that
 *     might provide user_id, etc to further control which rows are
 *     returned
 */
public function pagingLogic(&$data, $field_or_model, $output_field,
    $default_show, $search_array = [], $var_prefix = "", $args = null)
{
    $r = [];
    $request_fields = ['num_show' => C\DEFAULT_ADMIN_PAGING_NUM,
        'start_row' => 0, 'end_row' => C\DEFAULT_ADMIN_PAGING_NUM];
    foreach ($request_fields as $field => $default) {
        /* request values are untrusted (could be arrays or non-numeric),
           so make them ints before doing any arithmetic with them
         */
        $r[$field] = isset($_REQUEST[$var_prefix . $field]) ?
            $this->clean($_REQUEST[$var_prefix . $field], "int") :
            $default;
    }
    // the requested end row is always derived from start and page size
    $r['end_row'] = $r['start_row'] + $r['num_show'];
    $d = [];
    $data_fields = ['NUM_TOTAL', 'NUM_SHOW', 'START_ROW', 'END_ROW',
        'NEXT_START', 'NEXT_END', 'PREV_START', 'PREV_END'];
    foreach ($data_fields as $field) {
        $d[$field] = $var_prefix . $field;
    }
    // only page sizes offered by the paging table helper are honoured
    $num_show = isset(
        $this->view("admin")->helper("pagingtable")->show_choices[
        $r['num_show']]) ? $r['num_show'] : $default_show;
    $data[$d['NUM_SHOW']] = $num_show;
    $data[$d['START_ROW']] = max(0, $r['start_row']);
    if (is_object($field_or_model)) {
        $data[$output_field] = $field_or_model->getRows(
            $data[$d['START_ROW']], $num_show, $num_rows, $search_array,
            $args);
    } else {
        $num_rows = count($data[$field_or_model]);
        if ($search_array != []) {
            $out_data = [];
            foreach ($data[$field_or_model] as $name => $field_data) {
                $checks_passed = true;
                foreach ($search_array as $search_data) {
                    list($column_name, $comparison, $search_value, $sort) =
                        $search_data;
                    if ($search_value == "") {
                        continue;
                    }
                    if (isset($args[$column_name])) {
                        $column_name = $args[$column_name];
                    }
                    $row_value = is_object($field_data) ?
                        $field_data->$column_name :
                        $field_data[$column_name];
                    $cmp = strcmp($search_value, $row_value);
                    if (($cmp == 0 && $comparison == "=") ||
                        ($cmp != 0 && $comparison == "!=")) {
                        continue;
                    }
                    $pos = strpos($row_value, $search_value);
                    /* for a suffix test the *last* occurrence matters:
                       the first one may sit earlier in the string even
                       though the string also ends with the search value
                     */
                    $last_pos = strrpos($row_value, $search_value);
                    $len_row = strlen($row_value);
                    $len_search = strlen($search_value);
                    if (($comparison == "CONTAINS" && $pos !== false) ||
                        ($comparison == "BEGINS WITH" && $pos === 0) ||
                        ($comparison == "ENDS WITH" &&
                        $last_pos === $len_row - $len_search)) {
                        continue;
                    }
                    $checks_passed = false;
                    break;
                }
                if ($checks_passed) {
                    $out_data[$name] = $field_data;
                }
            }
            /* paging totals should describe the rows that survived the
               search restriction, not the unfiltered data
             */
            $num_rows = count($out_data);
            foreach ($search_array as $search_data) {
                list($column_name, $comparison, $search_value, $sort) =
                    $search_data;
                if ($sort == "NONE") {
                    continue;
                }
                if (isset($args[$column_name])) {
                    $column_name = $args[$column_name];
                }
                $values = [];
                foreach ($out_data as $name => $field_data) {
                    $values[$name] = is_object($field_data) ?
                        $field_data->$column_name :
                        $field_data[$column_name];
                }
                $sort = ($sort == "DESC") ? SORT_DESC : SORT_ASC;
                array_multisort($values, $sort, $out_data);
            }
        } else {
            $out_data = $data[$field_or_model];
        }
        $data[$output_field] = array_slice($out_data,
            $data[$d['START_ROW']], $num_show);
    }
    $data[$d['START_ROW']] = min($data[$d['START_ROW']], $num_rows);
    $data[$d['END_ROW']] = max($data[$d['START_ROW']],
        min($r['end_row'], $num_rows));
    $data[$d['NEXT_START']] = $data[$d['END_ROW']];
    $data[$d['NEXT_END']] = min($data[$d['NEXT_START']] + $num_show,
        $num_rows);
    $data[$d['PREV_START']] = max(0, $data[$d['START_ROW']] - $num_show);
    $data[$d['PREV_END']] = $data[$d['START_ROW']];
    $data[$d['NUM_TOTAL']] = $num_rows;
}
|
||||
/**
 * Invokes the named activity. If the activity is registered as belonging
 * to a component, it is invoked on that component; otherwise it is
 * invoked as a method of this controller.
 *
 * @param string $activity method to invoke
 * @return mixed whatever the invoked activity method returns
 */
public function call($activity)
{
    if (!isset($this->activity_component[$activity])) {
        return $this->$activity();
    }
    $component_name = $this->activity_component[$activity];
    return $this->component($component_name)->$activity();
}
|
||||
/**
 * Builds a cross site request forgery preventing token from the provided
 * user name, the current time and the hidden AUTH_KEY. As a side effect
 * the session's previous CSRF_TIME (or 0 if there was none) is remembered
 * in OLD_CSRF_TIME and CSRF_TIME is set to the current time.
 *
 * @param string $user username to use to generate token
 * @return string a csrf token of the form hash*timestamp
 */
public function generateCSRFToken($user)
{
    $now = time();
    $previous_time = 0;
    if (isset($_SESSION['CSRF_TIME'])) {
        $previous_time = $_SESSION['CSRF_TIME'];
    }
    $_SESSION['OLD_CSRF_TIME'] = $previous_time;
    $_SESSION['CSRF_TIME'] = $now;
    $hash = L\crawlHash($user . $now . C\AUTH_KEY);
    return $hash . "*" . $now;
}
|
||||
/**
 * Checks if the form CSRF (cross-site request forgery preventing) token
 * matches the given user and has not expired (1 hour till expires)
 *
 * The token is expected to have the form hash*timestamp as produced by
 * generateCSRFToken and to be exactly 22 characters long.
 *
 * @param string $token_name attribute of $_REQUEST containing CSRFToken
 * @param string $user user id
 * @return bool whether the CSRF token was valid
 */
public function checkCSRFToken($token_name, $user)
{
    // arrays (e.g. token[]=x) or missing values can never be valid tokens
    if (!isset($_REQUEST[$token_name]) ||
        !is_string($_REQUEST[$token_name]) ||
        strlen($_REQUEST[$token_name]) != 22) {
        return false;
    }
    $token_parts = explode("*", $_REQUEST[$token_name]);
    // the timestamp part is used in arithmetic, so insist it is numeric
    if (!isset($token_parts[1]) || !is_numeric($token_parts[1])) {
        return false;
    }
    if ($token_parts[1] + C\ONE_HOUR <= time()) {
        return false;
    }
    $expected_hash = (string)L\crawlHash($user . $token_parts[1] .
        C\AUTH_KEY);
    /* strict comparison: with == two numeric looking strings such as
       "0e123456789" and "0e000000000" would be considered equal
     */
    return $expected_hash === $token_parts[0];
}
|
||||
/**
 * Checks if the timestamp in $_REQUEST[$token_name]
 * matches the timestamp of the last CSRF token accessed by this user
 * for the kind of activity for which there might be a conflict.
 * This is to avoid accidental replays of postings etc if the back button
 * is used.
 *
 * When $action is empty the timestamp must equal the session's
 * OLD_CSRF_TIME. Otherwise the timestamp must be at least as recent as the
 * last timestamp recorded for $action in the session's OLD_ACTION_STAMPS;
 * in that case the timestamp is recorded and recorded timestamps older than
 * an hour are culled.
 *
 * @param string $token_name name of a $_REQUEST field used to hold a
 *     CSRF_TOKEN
 * @param string $action name of current action to check for conflicts
 * @return bool true if the timestamp was acceptable (no conflicting action
 *     has occurred), false otherwise
 */
public function checkCSRFTime($token_name, $action = "")
{
    if (!isset($_REQUEST[$token_name]) ||
        !is_string($_REQUEST[$token_name])) {
        return false;
    }
    $token_parts = explode("*", $_REQUEST[$token_name]);
    /* Only all-digit timestamps are accepted. Previously a non-numeric
       timestamp was loosely compared and, before PHP 8, was equal to the
       default OLD_CSRF_TIME of 0.
     */
    if (!isset($token_parts[1]) ||
        !preg_match('/^[0-9]+$/', $token_parts[1])) {
        return false;
    }
    $timestamp_to_check = intval($token_parts[1]);
    if ($action == "") {
        return isset($_SESSION['OLD_CSRF_TIME']) &&
            $timestamp_to_check === intval($_SESSION['OLD_CSRF_TIME']);
    }
    if (isset($_SESSION['OLD_ACTION_STAMPS'][$action]) &&
        $_SESSION['OLD_ACTION_STAMPS'][$action] > $timestamp_to_check) {
        // a newer token has already been used for this action
        return false;
    }
    $_SESSION['OLD_ACTION_STAMPS'][$action] = $timestamp_to_check;
    // forget stamps that are more than an hour old
    $cull_time = time() - C\ONE_HOUR;
    foreach ($_SESSION['OLD_ACTION_STAMPS'] as $act => $time) {
        if ($time < $cull_time) {
            unset($_SESSION['OLD_ACTION_STAMPS'][$act]);
        }
    }
    return true;
}
|
||||
/**
 * Used to clean strings that might be tainted as originate from the user
 *
 * @param mixed $value tainted data
 * @param mixed $type type of data in value can be one of the following
 *     strings: bool (or boolean), color, double, float, file_name, hash,
 *     int, string, web-url; or it can be an array listing allowed values.
 *     If the latter, then if the value is not in the array the cleaned
 *     value will be first element of the array if $default is null
 * @param mixed $default if $value is not set default value is returned,
 *     this isn't used much since if the error_reporting is E_ALL
 *     or -1 you would still get a Notice.
 * @return mixed the clean input matching the type provided; null if
 *     $type is not one of the supported types
 */
public function clean($value, $type, $default = null)
{
    $clean_value = null;
    if (is_array($type)) {
        if (in_array($value, $type)) {
            return $value;
        }
        if ($default != null) {
            return $default;
        }
        reset($type);
        return current($type);
    }
    switch ($type) {
        case "boolean":
            // no break
        case "bool":
            if (isset($value)) {
                if (is_bool($value)) {
                    // previously an actual bool fell through as null
                    $clean_value = $value;
                } else {
                    /* "true" or anything with a non-zero numeric value
                       counts as true. The explicit cast keeps the result
                       the same on PHP 7 and PHP 8, whose loose string to
                       number comparisons differ.
                     */
                    $clean_value = ($value === "true") ||
                        (is_scalar($value) && (float)$value != 0);
                }
            } else if ($default != null) {
                $clean_value = $default;
            } else {
                $clean_value = false;
            }
            break;
        case "color":
            if (isset($value)) {
                $colors = ["black", "silver", "gray", "white",
                    "maroon", "red", "purple", "fuchsia", "green", "lime",
                    "olive", "yellow", "navy", "blue", "teal", "aqua",
                    "orange", "aliceblue", "antiquewhite", "aquamarine",
                    "azure", "beige", "bisque", "blanchedalmond",
                    "blueviolet", "brown", "burlywood", "cadetblue",
                    "chartreuse", "chocolate", "coral", "cornflowerblue",
                    "cornsilk", "crimson", "darkblue", "darkcyan",
                    "darkgoldenrod", "darkgray", "darkgreen", "darkgrey",
                    "darkkhaki", "darkmagenta", "darkolivegreen",
                    "darkorange", "darkorchid", "darkred", "darksalmon",
                    "darkseagreen", "darkslateblue", "darkslategray",
                    "darkslategrey", "darkturquoise", "darkviolet",
                    "deeppink", "deepskyblue", "dimgray", "dodgerblue",
                    "firebrick", "floralwhite", "forestgreen", "gainsboro",
                    "ghostwhite", "gold", "goldenrod", "greenyellow",
                    "grey", "honeydew", "hotpink", "indianred", "indigo",
                    "ivory", "khaki", "lavender", "lavenderblush",
                    "lawngreen", "lemonchiffon", "lightblue", "lightcoral",
                    "lightcyan", "lightgoldenrodyellow", "lightgray",
                    "lightgreen", "lightgrey", "lightpink", "lightsalmon",
                    "lightseagreen", "lightskyblue", "lightslategray",
                    "lightslategrey", "lightsteelblue", "lightyellow",
                    "limegreen", "linen", "mediumaquamarine",
                    "mediumblue", "mediumorchid", "mediumpurple",
                    "mediumseagreen", "mediumslateblue",
                    "mediumspringgreen", "mediumturquoise",
                    "mediumvioletred", "midnightblue", "mintcream",
                    "mistyrose", "moccasin", "navajowhite", "oldlace",
                    "olivedrab", "orangered", "orchid", "palegoldenrod",
                    "palegreen", "paleturquoise", "palevioletred",
                    "papayawhip", "peachpuff", "peru", "pink", "plum",
                    "powderblue", "rosybrown", "royalblue", "saddlebrown",
                    "salmon", "sandybrown", "seagreen", "seashell",
                    "sienna", "skyblue", "slateblue", "slategray",
                    "slategrey", "snow", "springgreen", "steelblue",
                    "tan", "thistle", "tomato", "turquoise", "violet",
                    "wheat", "whitesmoke", "yellowgreen", "rebeccapurple"
                ];
                /* Name and hex checks are both done on the trimmed value,
                   and the name check is strict so that a non-string such
                   as 0 cannot loosely match a color name.
                 */
                $candidate = is_string($value) ? trim($value) : "";
                if (in_array($candidate, $colors, true)
                    || preg_match('/^#[a-fA-F0-9]{3}([a-fA-F0-9]{3})?$/',
                    $candidate)) {
                    $clean_value = $candidate;
                } else {
                    $clean_value = "#FFF";
                }
            } else if ($default != null) {
                $clean_value = $default;
            } else {
                $clean_value = "#FFF";
            }
            break;
        case "double":
            if (isset($value)) {
                $clean_value = doubleval($value);
            } else if ($default != null) {
                $clean_value = $default;
            } else {
                $clean_value = 0;
            }
            break;
        case "float":
            if (isset($value)) {
                $clean_value = floatval($value);
            } else if ($default != null) {
                $clean_value = $default;
            } else {
                $clean_value = 0;
            }
            break;
        case "file_name":
            if (isset($value)) {
                /* NOTE(review): in the reviewed text this call replaced
                   "&" by "&" (a no-op); restored to "&amp;" on the
                   assumption that an HTML rendering decoded the entity --
                   confirm against upstream.
                 */
                $value = str_replace("&amp;", "&", $value);
                // strip characters that are unsafe in a file name
                $clean_value = str_replace(["/", "\\", "*", ":"], "",
                    $value);
            } else {
                $clean_value = $default;
            }
            break;
        case "hash":
            if (isset($value)) {
                // a set but malformed hash is cleaned to null
                if (strlen($value) == strlen(L\crawlHash("A")) &&
                    base64_decode($value)) {
                    $clean_value = $value;
                }
            } else {
                $clean_value = $default;
            }
            break;
        case "int":
            if (isset($value)) {
                $clean_value = intval($value);
            } else if ($default != null) {
                $clean_value = $default;
            } else {
                $clean_value = 0;
            }
            break;
        case "string":
            if (isset($value)) {
                /* undo an existing &amp; so htmlentities below does not
                   double encode it (see NOTE(review) in file_name case
                   about this literal)
                 */
                $value2 = str_replace("&amp;", "&", $value);
                // -CP REMEMBER TO CK THIS!!!!!
                $value2 = mb_convert_encoding($value2, "UTF-8");
                $clean_value = @htmlentities($value2, ENT_QUOTES, "UTF-8");
            } else {
                $clean_value = $default;
            }
            break;
        case 'web-url':
            if (isset($value)) {
                $value = trim($value);
                $start = substr($value, 0, 4);
                $is_web = ($start == 'http');
                if (!$is_web && $start != "goph") {
                    $value = "http://$value";
                    $is_web = true;
                }
                if ($is_web) {
                    $value = str_replace("&amp;", "&", $value);
                    $clean_value = @htmlentities($value, ENT_QUOTES,
                        "UTF-8");
                }
                /* NOTE(review): for values beginning with "goph"
                   $clean_value is still null at this point, so null is
                   what gets passed to canonicalLink -- confirm intended.
                 */
                $clean_value = UrlParser::canonicalLink($clean_value,
                    $default, false);
            } else {
                $clean_value = $default;
            }
            break;
    }
    return $clean_value;
}
|
||||
/**
 * Joins an array of lines into a single string. Each line is trimmed,
 * optionally cleaned as a "string", trimmed again, and the results are
 * glued together using $endline_string as the separator.
 *
 * @param array $arr the array of lines to be processed
 * @param string $endline_string what string should be used to indicate
 *     the end of a line
 * @param bool $clean whether to clean each line
 * @return string a concatenated string of cleaned lines
 */
public function convertArrayLines($arr, $endline_string="\n",
    $clean = false)
{
    $pieces = [];
    foreach ($arr as $line) {
        $piece = trim($line);
        if ($clean) {
            $piece = $this->clean($piece, "string");
        }
        $pieces[] = trim($piece);
    }
    return implode($endline_string, $pieces);
}
|
||||
/**
 * Splits a string on newlines into an array of cleaned, trimmed, non-empty
 * lines. This is used in handling data from the crawl options
 * text areas. Lines beginning with # are kept as they are.
 *
 * @param string $str contains the url data
 * @param string $line_type does additional cleaning depending on the type
 *     of the lines. For instance, if is "url" then a line not beginning
 *     with one of the recognized prefixes (and not beginning with #) will
 *     have http:// prepended.
 * @return array an array of clean lines
 */
public function convertStringCleanArray($str, $line_type="url")
{
    $known_prefixes = ["file:/", "http:/", "domain", "https:", 'gopher'];
    $cleaned = [];
    foreach (preg_split('/\n+/', $str) as $raw_line) {
        $entry = trim($this->clean($raw_line, "string"));
        if (strlen($entry) == 0) {
            continue;
        }
        if ($line_type == "url") {
            $prefix = substr($entry, 0, 6);
            $is_comment = ($prefix[0] == "#");
            if (!$is_comment && !in_array($prefix, $known_prefixes)) {
                $entry = "http://" . $entry;
            }
        }
        $cleaned[] = $entry;
    }
    return $cleaned;
}
|
||||
/**
 * Checks the request if a request is for a valid activity and if it uses
 * the correct authorization key
 *
 * A request is accepted only if: it has non-empty time and session
 * fields; its activity (a) is one of $this->activities or its controller
 * (c) is 'jobs'; its time is within an hour of this machine's clock; and
 * its session equals md5 of the time concatenated with AUTH_KEY.
 *
 * @return bool whether the request was valid or not
 */
public function checkRequest()
{
    if (empty($_REQUEST['time']) || empty($_REQUEST['session'])) {
        return false;
    }
    // a and c may legitimately be absent, so avoid undefined index notices
    $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : null;
    $controller = isset($_REQUEST['c']) ? $_REQUEST['c'] : null;
    if (!in_array($activity, $this->activities, true) &&
        $controller !== 'jobs') {
        return false;
    }
    $time = $_REQUEST['time'];
    $session = $_REQUEST['session'];
    // time is used in arithmetic and session in a string comparison
    if (!is_numeric($time) || !is_string($session)) {
        return false;
    }
    // request must be within an hour of this machine's clock
    if (abs(time() - $time) > C\ONE_HOUR) {
        return false;
    }
    /* strict comparison: with != two md5 strings of the form 0e[digits]
       would be treated as equal numbers
     */
    return md5($time . C\AUTH_KEY) === $session;
}
|
||||
/**
 * Parses the head variables and body out of the data of a wiki or static
 * page and stores them on a view in $view->head_objects[$page_name] and
 * $view->page_objects[$page_name] respectively.
 *
 * @param object $view View on which page data will be rendered
 * @param string $page_name a string name/id to associate with page. For
 *     example, might have 404 for a page about 404 errors
 * @param string $page_data this is the actual content of a wiki or
 *     static page
 */
public function parsePageHeadVarsView($view, $page_name, $page_data)
{
    $parsed = $this->parsePageHeadVars($page_data, true);
    $view->head_objects[$page_name] = $parsed[0];
    $view->page_objects[$page_name] = $parsed[1];
}
|
||||
/**
 * Used to parse head meta variables out of a data string provided either
 * from a wiki page or a static page. Meta data is stored in lines
 * before the first occurrence of END_HEAD_VARS; these lines are
 * separated from each other by a blank line. Head variables
 * are name=value pairs. An example of head
 * variable might be:
 * title = This web page's title
 * Anything after a semi-colon on a line in the head section is treated as
 * a comment (including when the semi-colon is the first character of the
 * line). Lines that do not contain exactly one = are ignored. Names and
 * values are url decoded.
 *
 * @param string $page_data this is the actual content of a wiki or
 *     static page
 * @param bool $with_body whether to output just an array of head
 *     variables or to output a pair [head vars, page body]
 * @return array the associative array of head variables or pair
 *     [head vars, page body]
 */
public function parsePageHeadVars($page_data, $with_body = false)
{
    $page_parts = explode("END_HEAD_VARS", $page_data);
    $head_object = [];
    // with no END_HEAD_VARS marker the whole of $page_data is the body
    if (count($page_parts) > 1) {
        $head_lines = preg_split("/\n\n/", array_shift($page_parts));
        // later END_HEAD_VARS occurrences belong to the body
        $page_data = implode("END_HEAD_VARS", $page_parts);
        foreach ($head_lines as $line) {
            /* compare against false explicitly: a semi-colon at
               position 0 is falsy but still starts a comment
             */
            $semi_pos = strpos($line, ";");
            if ($semi_pos !== false) {
                $line = substr($line, 0, $semi_pos);
            }
            $line_parts = explode("=", $line);
            if (count($line_parts) == 2) {
                $head_object[trim(urldecode($line_parts[0]))] =
                    urldecode(trim($line_parts[1]));
            }
        }
    }
    if ($with_body) {
        return [$head_object, $page_data];
    }
    return $head_object;
}
|
||||
/**
 * If external source advertisements are present in the output of this
 * controller this function can be used to initialize the field variables
 * used to write the appropriate Javascripts
 *
 * Does nothing when the AD_LOCATION config constant is "none". Otherwise
 * sets $data["AD_LOCATION"] and, for each of TOP_ADSCRIPT, SIDE_ADSCRIPT
 * and GLOBAL_ADSCRIPT, sets $data[field] to the entity decoded value of
 * the corresponding config constant.
 *
 * @param array& $data data to be used in drawing the view
 * @param bool $ads_off whether or not ads are turned off; if true
 *     $data["AD_LOCATION"] is set to "none"
 */
public function initializeAdFields(&$data, $ads_off = false)
{
    if (C\AD_LOCATION != "none") {
        $data["AD_LOCATION"] = ($ads_off) ? "none" : C\AD_LOCATION;
        $ad_fields = ['TOP_ADSCRIPT', 'SIDE_ADSCRIPT', 'GLOBAL_ADSCRIPT'];
        foreach ($ad_fields as $ad_field) {
            $ad = html_entity_decode(constant(C\NS_CONFIGS . $ad_field),
                ENT_QUOTES);
            /* NOTE(review): the reviewed text used preg_replace with the
               patterns "[(]" and "[)]", which are invalid regular
               expressions (a lone parenthesis between [ ] delimiters) and
               make preg_replace return null. Presumably the patterns were
               the numeric entities for ( and ) before an HTML rendering
               decoded them; a literal replace of those entities is used
               here -- confirm against upstream.
             */
            $data[$ad_field] = str_replace(["&#40;", "&#41;"], ["(", ")"],
                $ad);
        }
    }
}
|
||||
/**
 * Adds to an integer, $actual_value, epsilon-noise drawn from a
 * distribution with density proportional to
 * exp(-|$actual_value - x| / sigma), where sigma = 1 / PRIVACY_EPSILON,
 * to get an epsilon private, integer value in the range
 * 0 .. 2 * $actual_value + 1.
 *
 * @param int $actual_value number want to make private
 * @return int $fuzzy_value number after noise added
 */
public function addDifferentialPrivacy($actual_value)
{
    $sigma = 1 / C\PRIVACY_EPSILON;
    $max_value = (2 * $actual_value) + 1;
    /* Calculation by Integration
     * f: exp(-1 * ((abs($actual_value - $x)) / $sigma))
     * Since function consists of absolute value, break the integration
     * into two to remove absolute from the function.
     * First integral runs from 0 through $actual_value, and second
     * integral runs from $actual_value through $max_value.
     * Then after using the substitution rule, the two integrals have
     * ranges [a, 0] and [0, c] giving the closed form below.
     */
    $a = -1 * $actual_value;
    $c = -1 * ($max_value - $actual_value);
    $integral_value = $sigma * (2 - exp($a / $sigma) - exp($c / $sigma));
    /* pick a real valued threshold in [0, $integral_value]; rand() only
       takes integer bounds so the float was previously truncated
     */
    $random = $integral_value * (mt_rand() / mt_getrandmax());
    $p = 0;
    for ($i = 0; $i < $max_value; $i++) {
        /* abs() matches the density f described above; without it the
           terms grow without bound once $i exceeds $actual_value
         */
        $p += exp(-1 * (abs($actual_value - $i) / $sigma));
        if ($p > $random) {
            break;
        }
    }
    // if the threshold was never exceeded $i equals $max_value here
    $fuzzy_value = $i;
    return $fuzzy_value;
}
|
||||
}
|
328
src/controllers/CrawlController.php
Normal file
328
src/controllers/CrawlController.php
Normal file
|
@ -0,0 +1,328 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\MediaConstants;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
|
||||
/**
 * Controller used to manage networked installations of Yioop where
 * there might be multiple queue_servers and a name_server. Commands
 * sent to the nameserver web page are mapped out to queue_servers
 * using this controller. Each method of the controller essentially
 * mimics one method of CrawlModel, PhraseModel, or in general anything
 * that extends ParallelModel and is used to proxy that information
 * through a result web page back to the name_server.
 *
 * NOTE(review): several activities call unserialize() on request data.
 * Activities are only dispatched after checkRequest() has succeeded, but
 * unserialize on external input is worth revisiting.
 *
 * @author Chris Pollett
 */
class CrawlController extends Controller implements CrawlConstants
{
    /**
     * These are the activities supported by this controller
     * @var array
     */
    public $activities = ["countWords","clearQuerySavePoint",
        "crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
        "combinedCrawlInfo", "getInfoTimestamp", "getCrawlItems",
        "getCrawlList", "getCrawlSeedInfo", "sendStartCrawlMessage",
        "sendStopCrawlMessage", "setCrawlSeedInfo",
        ];
    /**
     * Checks that the request seems to be coming from a legitimate fetcher then
     * determines which activity the fetcher is requesting and calls that
     * activity for processing.
     */
    public function processRequest()
    {
        /* do a quick test to see if this is a request seems like
            from a legitimate machine
         */
        if (!$this->checkRequest()) {return; }
        $activity = $_REQUEST['a'];
        if (in_array($activity, $this->activities)) {
            $this->call($activity);
        }
    }
    /**
     * Handles a request for whether or not the crawl is stalled on the
     * given local server (which means no fetcher has spoken to it in a while)
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     */
    public function crawlStalled()
    {
        echo L\webencode(serialize($this->model("crawl")->crawlStalled()));
    }
    /**
     * Handles a request for the crawl status (memory use, recent fetchers
     * crawl rate, etc) data from a remote name server
     * and retrieves the statistics about this that are held by the
     * local queue server
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     */
    public function crawlStatus()
    {
        echo L\webencode(serialize($this->model("crawl")->crawlStatus()));
    }
    /**
     * Handles a request for the starting parameters of a crawl of a given
     * timestamp and retrieves that information from the bundle held by the
     * local queue server
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     */
    public function getCrawlSeedInfo()
    {
        $timestamp = 0;
        if (isset($_REQUEST["arg"]) ) {
            $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
            $timestamp = substr($this->clean($timestamp, "int"), 0,
                C\TIMESTAMP_LEN);
        }
        echo L\webencode(serialize($this->model("crawl")->getCrawlSeedInfo(
            $timestamp)));
    }
    /**
     * Handles a request to change the parameters of a crawl of a given
     * timestamp on the local machine (does nothing if crawl doesn't exist)
     */
    public function setCrawlSeedInfo()
    {
        if (isset($_REQUEST["arg"]) ) {
            list($timestamp, $info) = unserialize(
                L\webdecode($_REQUEST["arg"]));
            $timestamp = substr($this->clean($timestamp, "int"), 0,
                C\TIMESTAMP_LEN);
            if ($timestamp && $info) {
                $this->model("crawl")->setCrawlSeedInfo($timestamp, $info);
            }
        }
    }
    /**
     * Handles a request for information about a crawl with a given timestamp
     * from a remote name server and retrieves statistics about this crawl
     * that are held by the local queue server (number of pages, name, etc)
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     */
    public function getInfoTimestamp()
    {
        $timestamp = 0;
        if (isset($_REQUEST["arg"]) ) {
            $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
            $timestamp = substr($this->clean($timestamp, "int"), 0,
                C\TIMESTAMP_LEN);
        }
        echo L\webencode(serialize($this->model("crawl")->getInfoTimestamp(
            $timestamp)));
    }
    /**
     * Handles a request for the crawl list (what crawl are stored on the
     * machine) data from a remote name server and retrieves the
     * statistic about this that are held by the local queue server
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     *
     * The request's arg field selects what else is listed: 1 means also
     * return arc bundles, 2 means also return recrawls, 3 means both.
     */
    public function getCrawlList()
    {
        $return_arc_bundles = false;
        $return_recrawls = false;
        if (isset($_REQUEST["arg"]) ) {
            $arg = trim(L\webdecode($_REQUEST["arg"]));
            $arg = $this->clean($arg, "int");
            if ($arg == 3 || $arg == 1) {$return_arc_bundles = true; }
            if ($arg == 3 || $arg == 2) {$return_recrawls = true; }
        }
        echo L\webencode(serialize($this->model("crawl")->getCrawlList(
            $return_arc_bundles, $return_recrawls)));
    }
    /**
     * Handles a request for the combined crawl list, stalled, and status
     * data from a remote name server and retrieves the statistics about
     * this that are held by the local queue server
     * outputs this info back as body of the http response (url encoded,
     * serialized php data)
     */
    public function combinedCrawlInfo()
    {
        $combined = $this->model("crawl")->combinedCrawlInfo();
        echo L\webencode(serialize($combined));
    }
    /**
     * Receives a request to delete a crawl from a remote name server
     * and then deletes crawl on the local queue server
     */
    public function deleteCrawl()
    {
        if (!isset($_REQUEST["arg"]) ) {
            return;
        }
        /* Decode first, then clean. Previously the clean was applied to
           a not yet defined $timestamp and its result was overwritten by
           the raw unserialized value, so the model received uncleaned
           input.
         */
        $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
        $timestamp = substr($this->clean($timestamp, "int"), 0,
            C\TIMESTAMP_LEN);
        $this->model("crawl")->deleteCrawl($timestamp);
    }

    /**
     * Receives a request to inject new urls into the active
     * crawl from a remote name server and then does this for
     * the local queue server
     *
     * Only the urls that L\partitionByHash assigns to partition i out of
     * num (request fields of the same names) are injected locally.
     */
    public function injectUrlsCurrentCrawl()
    {
        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
            || !isset($_REQUEST["i"])) {
            return;
        }
        $num = $this->clean($_REQUEST["num"], "int");
        $i = $this->clean($_REQUEST["i"], "int");
        list($timestamp, $inject_urls) =
            unserialize(L\webdecode($_REQUEST["arg"]));
        $timestamp = substr($this->clean($timestamp, "int"), 0,
            C\TIMESTAMP_LEN);
        $inject_urls = L\partitionByHash($inject_urls,
            null, $num, $i, C\NS_LIB . "UrlParser::getHost");
        $this->model("crawl")->injectUrlsCurrentCrawl($timestamp,
            $inject_urls, null);
    }
    /**
     * Receives a request to get crawl summary data for an array of urls
     * from a remote name server and then looks these up on the local
     * queue server
     *
     * The response is sent as text/plain with an explicit Content-Length
     * and includes an ELAPSED_TIME entry for the time spent here.
     */
    public function getCrawlItems()
    {
        $crawl_model = $this->model("crawl");
        $start_time = microtime(true);
        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
            || !isset($_REQUEST["i"])) {
            return;
        }
        $num = $this->clean($_REQUEST["num"], "int");
        $i = $this->clean($_REQUEST["i"], "int");
        $crawl_model->current_machine = $i;
        $lookups = unserialize(L\webdecode($_REQUEST["arg"]));
        $our_lookups = [];
        foreach ($lookups as $lookup => $lookup_info) {
            /* a pair whose first entry is a string beginning with h, r
               or g is passed through unchanged
             */
            if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h'
                || $lookup_info[0][0] === 'r'
                || $lookup_info[0][0] === 'g')) {
                $our_lookups[$lookup] = $lookup_info;
            } else {
                $our_lookups[$lookup] = [];
                foreach ($lookup_info as $lookup_item) {
                    if (count($lookup_item) == 2) {
                        $our_lookups[$lookup][] = $lookup_item;
                    } else {
                        // keep only items whose index is this machine's
                        list($index, , , , ) = $lookup_item;
                        if ($index == $i) {
                            $our_lookups[$lookup][] = $lookup_item;
                        }
                    }
                }
            }
        }
        $items = $crawl_model->getCrawlItems($our_lookups);
        $items["ELAPSED_TIME"] = L\changeInMicrotime($start_time);
        header("Content-Type: text/plain");
        $items = L\webencode(serialize($items));
        header("Content-Length: ".strlen($items));
        echo $items;
        flush();
    }
    /**
     * Receives a request to get counts of the number of occurrences of an
     * array of words from a remote name server and then
     * determines and outputs these counts for the local queue server
     */
    public function countWords()
    {
        if (!isset($_REQUEST["arg"]) ) {
            return;
        }
        $crawl_model = $this->model("crawl");
        list($words, $index_name) = unserialize(L\webdecode($_REQUEST["arg"]));
        $crawl_model->index_name = $index_name;
        echo L\webencode(serialize(
            $crawl_model->countWords($words)));
    }
    /**
     * Receives a request to stop a crawl from a remote name server
     * and then stop the current crawl on the local queue server
     */
    public function sendStopCrawlMessage()
    {
        $this->model("crawl")->sendStopCrawlMessage();
    }
    /**
     * Receives a request to start a crawl from a remote name server
     * and then starts the crawl process on the local queue server
     *
     * Only the seed urls that L\partitionByHash assigns to partition i
     * out of num (request fields of the same names) are used locally.
     */
    public function sendStartCrawlMessage()
    {
        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
            || !isset($_REQUEST["i"])) {
            return;
        }
        $num = $this->clean($_REQUEST["num"], "int");
        $i = $this->clean($_REQUEST["i"], "int");
        list($crawl_params,
            $seed_info) = unserialize(L\webdecode($_REQUEST["arg"]));
        $seed_info['seed_sites']['url'] =
            L\partitionByHash($seed_info['seed_sites']['url'],
            null, $num, $i, C\NS_LIB . "UrlParser::getHost");
        $this->model("crawl")->sendStartCrawlMessage($crawl_params, $seed_info,
            null);
    }
    /**
     * A save point is used to store to disk a sequence generation-doc-offset
     * pairs of a particular mix query when doing an archive crawl of a crawl
     * mix. This is used so that the mix can remember where it was the next
     * time it is invoked by the web app on the machine in question.
     * This function deletes such a save point associated with a timestamp
     */
    public function clearQuerySavePoint()
    {
        if (!isset($_REQUEST["arg"])) {
            return;
        }
        $save_timestamp = substr($this->clean($_REQUEST["arg"], "int"), 0,
            C\TIMESTAMP_LEN);
        $this->model("crawl")->clearQuerySavePoint($save_timestamp);
    }
}
|
||||
|
643
src/controllers/FetchController.php
Normal file
643
src/controllers/FetchController.php
Normal file
|
@ -0,0 +1,643 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\classifiers\Classifier;
|
||||
|
||||
// to allow the calculation of longer archive schedules
|
||||
ini_set('max_execution_time', 60);
|
||||
/**
|
||||
* This class handles data coming to a queue_server from a fetcher
|
||||
* Basically, it receives the data from the fetcher and saves it into
|
||||
* various files for later processing by the queue server.
|
||||
* This class can also be used by a fetcher to get status information.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class FetchController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* These are the activities supported by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["schedule", "archiveSchedule", "update", "crawlTime"];
|
||||
/**
|
||||
* Number of seconds that must elapse after last call before doing
|
||||
* cron activities (mainly check liveness of fetchers which should be
|
||||
* alive)
|
||||
*/
|
||||
const CRON_INTERVAL = 300;
|
||||
/**
 * Checks that the request seems to be coming from a legitimate fetcher,
 * records the requesting robot instance in the robot table file, then
 * determines which activity the fetcher is requesting and calls that
 * activity for processing.
 *
 * The robot table maps a cleaned robot_instance name to the triple
 * [remote address, machine_uri, time of this request].
 */
public function processRequest()
{
    /* do a quick test to see if this request seems to come from a
       legitimate machine
     */
    if (!$this->checkRequest()) {
        return;
    }
    // a request without an activity simply matches nothing below
    $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : "";
    $robot_table_name = C\CRAWL_DIR . "/" . self::robot_table_name;
    $robot_table = [];
    if (file_exists($robot_table_name)) {
        $robot_table = unserialize(file_get_contents($robot_table_name));
        if (!is_array($robot_table)) {
            /* unreadable or corrupt table file: start from an empty
               table rather than indexing into a non-array
             */
            $robot_table = [];
        }
    }
    if (isset($_REQUEST['robot_instance']) &&
        isset($_REQUEST['machine_uri'])) {
        $robot_table[$this->clean($_REQUEST['robot_instance'], "string")] =
            [$_SERVER['REMOTE_ADDR'],
            $this->clean($_REQUEST['machine_uri'], "string"),
            time()];
        file_put_contents($robot_table_name, serialize($robot_table),
            LOCK_EX);
    }
    if (in_array($activity, $this->activities, true)) {
        $this->call($activity);
    }
}
|
||||
/**
 * Hands the requesting fetcher the schedule file for the requested crawl
 * time, if one exists, and then deletes that file. When no schedule file
 * is present, a serialized NO_DATA_STATE status is returned instead.
 */
public function schedule()
{
    $view = "fetch";
    $crawl_time = 0;
    if (isset($_REQUEST['crawl_time'])) {
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
            0, C\TIMESTAMP_LEN);
    }
    $schedule_filename = C\CRAWL_DIR . "/schedules/" .
        self::schedule_name . $crawl_time . ".txt";
    $data = [];
    if (!file_exists($schedule_filename)) {
        /* check if scheduler part of queue server went down
           and needs to be restarted with current crawl time.
           Idea is fetcher has recently spoken with name server
           so knows the crawl time. queue server knows time
           only by file messages never by making curl requests
         */
        $this->checkRestart(self::WEB_CRAWL);
        $no_data_info = [];
        $no_data_info[self::STATUS] = self::NO_DATA_STATE;
        $data['MESSAGE'] = base64_encode(serialize($no_data_info)) . "\n";
    } else {
        // a schedule is handed out once: read it, then remove it
        $data['MESSAGE'] = file_get_contents($schedule_filename);
        unlink($schedule_filename);
    }
    $this->displayView($view, $data);
}
|
||||
/**
 * Returns the next batch of pages (or next chunk, for text archive bundle
 * iterators) from the archive currently being crawled to the requesting
 * fetcher. The crawl parameters are read from NameServerMessages.txt; a
 * STOP_CRAWL status there removes the message and lock files and no pages
 * are fetched. If there is nothing to iterate over, the lock is held by
 * another request, or the iterator produced no pages, the status is set to
 * NO_DATA_STATE and an empty page list is sent.
 *
 * The lock file stores the serialized start time of the request holding it;
 * the lock is treated as held while that time is less than
 * max_execution_time seconds old.
 */
public function archiveSchedule()
{
    $view = "fetch";
    $request_start = time();
    $crawl_time = 0;
    if (isset($_REQUEST['crawl_time'])) {
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
            0, C\TIMESTAMP_LEN);
    }
    $messages_filename = C\CRAWL_DIR . '/schedules/NameServerMessages.txt';
    $lock_filename = C\WORK_DIRECTORY . "/schedules/NameServerLock.txt";
    $info = [];
    $fetch_pages = false;
    if ($crawl_time > 0 && file_exists($messages_filename)) {
        $fetch_pages = true;
        $info = unserialize(file_get_contents($messages_filename));
        if ($info[self::STATUS] == 'STOP_CRAWL') {
            /* The stop crawl message gets created by the admin_controller
               when the "stop crawl" button is pressed.*/
            foreach ([$messages_filename, $lock_filename] as $stale_file) {
                if (file_exists($stale_file)) {
                    unlink($stale_file);
                }
            }
            $fetch_pages = false;
            $info = [];
        }
        $this->checkRestart(self::ARCHIVE_CRAWL);
    }
    $got_lock = true;
    if (file_exists($lock_filename)) {
        $lock_age = $request_start -
            unserialize(file_get_contents($lock_filename));
        if ($lock_age < ini_get('max_execution_time')) {
            $got_lock = false;
        }
    }
    $pages = [];
    $archive_iterator = null;
    if ($fetch_pages && $got_lock) {
        file_put_contents($lock_filename, serialize($request_start));
        $is_mix = ($info[self::ARC_DIR] == "MIX");
        if ($is_mix || file_exists($info[self::ARC_DIR])) {
            $iterate_timestamp = $info[self::CRAWL_INDEX];
            $result_dir = C\WORK_DIRECTORY .
                "/schedules/" . self::name_archive_iterator . $crawl_time;
            $iterator_name = C\NS_ARCHIVE . $info[self::ARC_TYPE] .
                "Iterator";
            try {
                if ($is_mix) {
                    //recrawl of crawl mix case
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $crawl_time);
                } else {
                    //any other archive crawl except web archive recrawls
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $info[self::ARC_DIR],
                        $crawl_time, $result_dir);
                }
            } catch (\Exception $e) {
                $info['ARCHIVE_BUNDLE_ERROR'] =
                    "Invalid bundle iterator: '{$iterator_name}' \n".
                    $e->getMessage();
            }
        }
        $pages = false;
        if ($archive_iterator && !$archive_iterator->end_of_iterator) {
            $is_text_bundle = L\generalIsA($archive_iterator,
                C\NS_ARCHIVE . "TextArchiveBundleIterator");
            if ($is_text_bundle) {
                $pages = $archive_iterator->nextChunk();
            } else {
                $pages = $archive_iterator->nextPages(
                    C\ARCHIVE_BATCH_SIZE);
            }
        }
        if (file_exists($lock_filename)) {
            unlink($lock_filename);
        }
    }
    if ($archive_iterator && $archive_iterator->end_of_iterator) {
        $info[self::END_ITERATOR] = true;
    }
    if (empty($pages)) {
        // nothing to send: report no data and ship an empty page list
        $info[self::STATUS] = self::NO_DATA_STATE;
        $info[self::POST_MAX_SIZE] = L\metricToInt(
            ini_get("post_max_size"));
        $pages = [];
    }
    $info[self::DATA] = L\webencode(gzcompress(serialize($pages)));
    $data = [];
    $data['MESSAGE'] = serialize($info);
    $this->displayView($view, $data);
}
|
||||
/**
 * Checks if the queue server crawl needs to be restarted and, if so, sends
 * a RESUME_CRAWL start-crawl message through the crawl model.
 *
 * A restart is only requested when all of the following hold: the request
 * carries a positive crawl_time; the index-closed schedule file for that
 * crawl exists and its access time is earlier than the request's
 * check_crawl_time; no QueueServerMessages.txt is pending; crawl_status.txt
 * (if present) does not record a non-zero CRAWL_TIME; and the index
 * folder for the crawl exists in the cache directory.
 *
 * @param string $crawl_type if it does use restart the crawl as a crawl
 * of this type. For example, self::WEB_CRAWL or self::ARCHIVE_CRAWL
 */
public function checkRestart($crawl_type)
{
    /* both default to 0 so that a request which sends crawl_time without
       check_crawl_time never reads an undefined variable
     */
    $crawl_time = 0;
    $check_crawl_time = 0;
    if (isset($_REQUEST['crawl_time'])) {
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
            0, C\TIMESTAMP_LEN);
        if (isset($_REQUEST['check_crawl_time'])) {
            $check_crawl_time = substr($this->clean(
                $_REQUEST['check_crawl_time'], 'int'), 0, C\TIMESTAMP_LEN);
        }
    }
    $index_schedule_file = C\CRAWL_DIR . "/schedules/" .
        self::index_closed_name . $crawl_time . ".txt";
    $should_check = $crawl_time > 0 && file_exists($index_schedule_file) &&
        $check_crawl_time > intval(fileatime($index_schedule_file)) &&
        !file_exists(C\CRAWL_DIR . "/schedules/QueueServerMessages.txt");
    if (!$should_check) {
        return;
    }
    $status_filename = C\CRAWL_DIR . "/schedules/crawl_status.txt";
    if (file_exists($status_filename)) {
        $crawl_status = unserialize(file_get_contents($status_filename));
        if (isset($crawl_status['CRAWL_TIME']) &&
            $crawl_status['CRAWL_TIME'] != 0) {
            // status file already records a non-zero crawl time
            return;
        }
    }
    if (!file_exists(C\CRAWL_DIR . '/cache/' .
        self::index_data_base_name . $crawl_time)) {
        return;
    }
    $crawl_params = [];
    $crawl_params[self::STATUS] = "RESUME_CRAWL";
    $crawl_params[self::CRAWL_TIME] = $crawl_time;
    $crawl_params[self::CRAWL_TYPE] = $crawl_type;
    /*
        we only set crawl time. Other data such as allowed sites
        should come from index.
    */
    $this->model("crawl")->sendStartCrawlMessage($crawl_params,
        null, null);
}
|
||||
/**
 * Processes Robot, To Crawl, and Index data sent from a fetcher.
 * Acknowledges to the fetcher whether this data was received okay.
 *
 * Data may arrive in one request (num_parts == 1) or in several parts
 * (num_parts > 1). Parts are appended to a temp file named after
 * hash_data; once current_part equals num_parts the assembled data is
 * passed to handleUploadedData(). The reply is a serialized array holding
 * CONTINUE_STATE on success or REDO_STATE (with a SUMMARY of what was
 * wrong) otherwise, together with memory usage, post_max_size and the
 * current crawl time. Peak memory figures in crawl_status.txt are updated
 * when the new values are larger.
 */
public function update()
{
    $view = "fetch";
    $info_flag = false;
    $logging = "";
    $necessary_fields = ['byte_counts', 'current_part', 'hash_data',
        'hash_part', 'num_parts', 'part'];
    $part_flag = true;
    $missing = "";
    foreach ($necessary_fields as $field) {
        if (!isset($_REQUEST[$field])) {
            $part_flag = false;
            $missing = $field;
        }
    }
    if (isset($_REQUEST['crawl_type'])) {
        $this->checkRestart($this->clean(
            $_REQUEST['crawl_type'], 'string'));
    }
    if ($part_flag &&
        L\crawlHash($_REQUEST['part']) == $_REQUEST['hash_part']) {
        $upload = false;
        $filename = "";
        $num_parts = intval($_REQUEST['num_parts']);
        if ($num_parts > 1) {
            $info_flag = true;
            if (!file_exists(C\CRAWL_DIR . "/temp")) {
                mkdir(C\CRAWL_DIR . "/temp");
                L\setWorldPermissionsRecursive(C\CRAWL_DIR . "/temp/");
            }
            /* hash_data comes from the request; basename() keeps the
               temp file inside the temp folder even if it contains
               path separators
             */
            $filename = C\CRAWL_DIR . "/temp/" .
                basename($_REQUEST['hash_data']);
            file_put_contents($filename, $_REQUEST['part'], FILE_APPEND);
            L\setWorldPermissions($filename);
            if ($_REQUEST['num_parts'] == $_REQUEST['current_part']) {
                $upload = true;
            }
        } else if ($num_parts == 1) {
            $info_flag = true;
            $upload = true;
        }
        if ($upload) {
            $logging = $this->handleUploadedData($filename);
        } else if ($num_parts > 1) {
            /* only report progress for a real multi-part upload; this
               also avoids dividing by a zero num_parts
             */
            $logging = "..." . (
                intval($_REQUEST['current_part']) / $num_parts) .
                " of data uploaded.";
        }
    }
    $info = [];
    if ($logging != "") {
        $info[self::LOGGING] = $logging;
    }
    if ($info_flag == true) {
        $info[self::STATUS] = self::CONTINUE_STATE;
    } else {
        $info[self::STATUS] = self::REDO_STATE;
        if (!$part_flag) {
            $info[self::SUMMARY] = "Missing request field: $missing.";
        } else {
            $info[self::SUMMARY] = "Hash of uploaded data was:".
                L\crawlHash($_REQUEST['part']).". Sent checksum was:".
                $_REQUEST['hash_part'];
        }
    }
    $info[self::MEMORY_USAGE] = memory_get_peak_usage();
    $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
    $status_filename = C\CRAWL_DIR . "/schedules/crawl_status.txt";
    if (file_exists($status_filename)) {
        $change = false;
        $crawl_status = unserialize(file_get_contents($status_filename));
        if (isset($_REQUEST['fetcher_peak_memory'])) {
            /* test the same key that is compared and written, so a
               smaller value no longer overwrites the recorded peak
             */
            if (!isset($crawl_status['FETCHER_PEAK_MEMORY']) ||
                $_REQUEST['fetcher_peak_memory'] >
                $crawl_status['FETCHER_PEAK_MEMORY']) {
                $crawl_status['FETCHER_PEAK_MEMORY'] =
                    $_REQUEST['fetcher_peak_memory'];
                $change = true;
            }
        }
        if (!isset($crawl_status['WEBAPP_PEAK_MEMORY']) ||
            $info[self::MEMORY_USAGE] >
            $crawl_status['WEBAPP_PEAK_MEMORY']) {
            $crawl_status['WEBAPP_PEAK_MEMORY'] =
                $info[self::MEMORY_USAGE];
            $change = true;
        }
        if (!isset($crawl_status[self::CRAWL_TIME])) {
            $network_filename = C\CRAWL_DIR .
                "/schedules/network_status.txt";
            if (file_exists($network_filename)) {
                $info[self::CRAWL_TIME] = unserialize(file_get_contents(
                    $network_filename));
                $change = true;
            } else {
                $info[self::CRAWL_TIME] = 0;
            }
        } else {
            $info[self::CRAWL_TIME] = $crawl_status['CRAWL_TIME'];
        }
        if ($change == true) {
            file_put_contents($status_filename,
                serialize($crawl_status), LOCK_EX);
        }
    } else {
        $info[self::CRAWL_TIME] = 0;
    }
    // refresh the figure after the file work above
    $info[self::MEMORY_USAGE] = memory_get_peak_usage();
    $data = [];
    $data['MESSAGE'] = serialize($info);
    $this->displayView($view, $data);
}
|
||||
/**
 * After robot, schedule, and index data have been uploaded and reassembled
 * as one big data file/string, this function splits that string into
 * each of these data types and then saves the result into the appropriate
 * schedule sub-folder. Any temporary file used during uploading is then
 * deleted.
 *
 * The uploaded string is laid out as robot data, cache page validation
 * data, schedule data and finally index data; the lengths of the first
 * three come from the ROBOT, CACHE_PAGE_VALIDATION and SCHEDULE entries of
 * the byte_counts request field, and index data is whatever remains.
 * Nothing is saved unless byte_counts has a positive TOTAL entry.
 *
 * @param string $filename name of temp file used to upload big string.
 * If uploaded data was small enough to be uploaded in one go, then
 * this should be "" -- the variable $_REQUEST["part"] will be used
 * instead
 * @return string $logging diagnostic info to be sent to fetcher about
 * what was done
 */
public function handleUploadedData($filename = "")
{
    if ($filename == "") {
        $uploaded = $_REQUEST['part'];
    } else {
        $uploaded = file_get_contents($filename);
        unlink($filename);
        if ($uploaded === false) {
            // unreadable temp file: treat as an empty upload
            $uploaded = "";
        }
    }
    $logging = "... Data upload complete\n";
    $byte_counts = [];
    if (isset($_REQUEST['byte_counts'])) {
        /* NOTE(review): unserialize() is applied to request data here.
           This relies on the caller having authenticated the request;
           a non-object encoding would be safer.
         */
        $byte_counts = unserialize(L\webdecode($_REQUEST['byte_counts']));
    }
    if (!is_array($byte_counts) || !isset($byte_counts["TOTAL"]) ||
        !($byte_counts["TOTAL"] > 0)) {
        return $logging;
    }
    // missing or negative section lengths are treated as 0
    $lengths = [];
    foreach (["ROBOT", "CACHE_PAGE_VALIDATION", "SCHEDULE"] as $section) {
        $lengths[$section] = isset($byte_counts[$section]) ?
            max(0, intval($byte_counts[$section])) : 0;
    }
    $pos = 0;
    $robot_data = (string)substr($uploaded, $pos, $lengths["ROBOT"]);
    $pos += $lengths["ROBOT"];
    $cache_page_validation_data = (string)substr($uploaded, $pos,
        $lengths["CACHE_PAGE_VALIDATION"]);
    $pos += $lengths["CACHE_PAGE_VALIDATION"];
    $schedule_data = (string)substr($uploaded, $pos, $lengths["SCHEDULE"]);
    $pos += $lengths["SCHEDULE"];
    $index_data = (string)substr($uploaded, $pos);
    if (strlen($robot_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::robot_data_base_name,
            $robot_data);
    }
    if (C\USE_ETAG_EXPIRES && strlen($cache_page_validation_data) > 0) {
        $this->addScheduleToScheduleDirectory(
            self::etag_expires_data_base_name,
            $cache_page_validation_data);
    }
    if (strlen($schedule_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::schedule_data_base_name,
            $schedule_data);
    }
    if (strlen($index_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::index_data_base_name,
            $index_data);
    }
    return $logging;
}
|
||||
/**
 * Saves $data_string to a file in the schedules folder. The file lives in
 * CRAWL_DIR/schedules/{$schedule_name}{crawl_time}/{day}/ where crawl_time
 * comes from the request and day is the current time divided by ONE_DAY.
 * The file name records the current time, the requester's address (with
 * "." and ":" replaced so it is file-name friendly) and a hash of the data.
 * Missing folders are created and made world accessible.
 *
 * @param string $schedule_name the name of the kind of schedule being saved
 * @param string& $data_string encoded, compressed, serialized data the
 * schedule is to contain
 */
public function addScheduleToScheduleDirectory($schedule_name,
    &$data_string)
{
    $crawl_time = substr($this->clean($_REQUEST['crawl_time'], "int"), 0,
        C\TIMESTAMP_LEN);
    $address = str_replace([".", ":"], ["-", "_"],
        $_SERVER['REMOTE_ADDR']);
    $time = time();
    $day = floor($time/C\ONE_DAY);
    $schedule_dir = C\CRAWL_DIR . "/schedules/" . $schedule_name .
        $crawl_time;
    $day_dir = $schedule_dir . "/$day";
    // parent first, then the per-day folder beneath it
    foreach ([$schedule_dir, $day_dir] as $needed_dir) {
        if (!file_exists($needed_dir)) {
            mkdir($needed_dir);
            chmod($needed_dir, 0777);
        }
    }
    $data_hash = L\crawlHash($data_string);
    $out_filename = $day_dir . "/At" . $time . "From" . $address .
        "WithHash$data_hash.txt";
    file_put_contents($out_filename, $data_string);
}
|
||||
/**
 * Checks for the crawl time according either to crawl_status.txt or to
 * network_status.txt, and presents it to the requesting fetcher, along
 * with a list of available queue servers.
 *
 * As a side effect, if more than CRON_INTERVAL seconds have passed since
 * the "fetcher_restart" cron time, that time is updated and
 * doFetcherCronTasks() is run. When NameServerMessages.txt says
 * STOP_CRAWL, the reply's status is set to STOP_CRAWL and its crawl time
 * to 0. Otherwise a minimum fetch loop time is sent and, if the fetcher's
 * crawl time differs from the current one, the crawl parameters and any
 * active classifier/ranker data are copied into the reply as well.
 */
public function crawlTime()
{
    $info = [];
    $info[self::STATUS] = self::CONTINUE_STATE;
    $view = "fetch";
    $cron_model = $this->model("cron");
    if (isset($_REQUEST['crawl_time'])) {
        $prev_crawl_time = substr(
            $this->clean($_REQUEST['crawl_time'], 'int'), 0,
            C\TIMESTAMP_LEN);
    } else {
        $prev_crawl_time = 0;
    }
    $cron_time = $cron_model->getCronTime("fetcher_restart");
    $delta = time() - $cron_time;
    if ($delta > self::CRON_INTERVAL) {
        $cron_model->updateCronTime("fetcher_restart");
        $this->doFetcherCronTasks();
    } else if ($delta == 0) {
        $cron_model->updateCronTime("fetcher_restart");
    }
    $local_filename = C\CRAWL_DIR . "/schedules/crawl_status.txt";
    $network_filename = C\CRAWL_DIR . "/schedules/network_status.txt";
    if (file_exists($local_filename)) {
        $crawl_status = unserialize(file_get_contents($local_filename));
        $crawl_time = (isset($crawl_status["CRAWL_TIME"])) ?
            $crawl_status["CRAWL_TIME"] : 0;
    } else if (file_exists($network_filename)) {
        $crawl_time = unserialize(file_get_contents($network_filename));
    } else {
        $crawl_time = 0;
    }
    $info[self::CRAWL_TIME] = $crawl_time;
    $status_filename = C\CRAWL_DIR . "/schedules/NameServerMessages.txt";
    if ($crawl_time != 0 && file_exists($status_filename)) {
        $status = unserialize(file_get_contents($status_filename));
        if ($status[self::STATUS] == 'STOP_CRAWL') {
            /* this used to be a == comparison whose result was thrown
               away; assigning is what the surrounding code intends
             */
            $info[self::STATUS] = 'STOP_CRAWL';
            $info[self::CRAWL_TIME] = 0;
        } else {
            /* scale the fetch loop time by the number of sub-folders of
               this crawl's index schedule folder, keeping it between
               MINIMUM_FETCH_LOOP_TIME and PROCESS_TIMEOUT/2 (the minimum
               wins if the two bounds conflict)
             */
            $tmp_base_dir = C\CRAWL_DIR . "/schedules/" .
                self::index_data_base_name . $crawl_time;
            $tmp_dirs = glob($tmp_base_dir . '/*', GLOB_ONLYDIR);
            $mult_factor = max(1, count($tmp_dirs));
            $info[self::MINIMUM_FETCH_LOOP_TIME] = max(min(
                $mult_factor * C\MINIMUM_FETCH_LOOP_TIME,
                C\PROCESS_TIMEOUT/2), C\MINIMUM_FETCH_LOOP_TIME);
            if ($crawl_time != $prev_crawl_time) {
                $to_copy_fields = [self::ALLOWED_SITES, self::ARC_DIR,
                    self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
                    self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
                    self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
                    self::SUMMARIZER_OPTION, self::TOR_PROXY
                ];
                foreach ($to_copy_fields as $field) {
                    if (isset($status[$field])) {
                        $info[$field] = $status[$field];
                    }
                }
                /*
                   When initiating a new crawl AND there are active
                   classifiers (an array of class labels), then augment
                   the info with compressed, serialized versions of each
                   active classifier so that each fetcher can reconstruct
                   the same classifiers.
                 */
                $classifier_array = [];
                if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                    $classifier_array = array_merge(
                        $status[self::ACTIVE_CLASSIFIERS]);
                    $info[self::ACTIVE_CLASSIFIERS] =
                        $status[self::ACTIVE_CLASSIFIERS];
                }
                if (isset($status[self::ACTIVE_RANKERS])) {
                    $classifier_array = array_merge($classifier_array,
                        $status[self::ACTIVE_RANKERS]);
                    $info[self::ACTIVE_RANKERS] =
                        $status[self::ACTIVE_RANKERS];
                }
                if ($classifier_array != []) {
                    $classifiers_data = Classifier::loadClassifiersData(
                        $classifier_array);
                    $info[self::ACTIVE_CLASSIFIERS_DATA] =
                        $classifiers_data;
                }
            }
        }
    }
    $info[self::SCRAPERS] = base64_encode(
        serialize($this->model("scraper")->getAllScrapers()));
    $info[self::QUEUE_SERVERS] =
        $this->model("machine")->getQueueServerUrls();
    $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
    $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
    if (count($info[self::QUEUE_SERVERS]) == 0) {
        // no queue servers configured: fall back to the name server
        $info[self::QUEUE_SERVERS] = [C\NAME_SERVER];
    }
    $data = [];
    $data['MESSAGE'] = serialize($info);
    $this->displayView($view, $data);
}
|
||||
/**
 * Periodic maintenance hook for the Name Server. At present its only job
 * is to ask the machine model to restart crashed fetchers.
 */
public function doFetcherCronTasks()
{
    $machine_model = $this->model("machine");
    $machine_model->restartCrashedFetchers();
}
|
||||
/**
 * Gets a list of all the timestamps of previously stored crawls
 *
 * This could probably be moved to crawl model. It is a little lighter
 * than getCrawlList and should be only used with a name server so leaving
 * it here so it won't be confused.
 *
 * Timestamps are taken from the names of entries in CRAWL_DIR/cache: the
 * text after index_data_base_name, and, for entries containing
 * network_base_name, the text after it minus its last four characters
 * (presumably a ".txt" extension -- confirm) when that text is numeric.
 *
 * @return array list of timestamps (duplicates removed; keys are not
 * renumbered)
 */
public function getCrawlTimes()
{
    $list = [];
    $dirs = glob(C\CRAWL_DIR . '/cache/*');
    if (empty($dirs)) {
        // glob() gives false on error and may give false/[] on no match
        return $list;
    }
    $index_name_len = strlen(self::index_data_base_name);
    $network_name_len = strlen(self::network_base_name);
    foreach ($dirs as $dir) {
        $pre_timestamp = strstr($dir, self::index_data_base_name);
        if ($pre_timestamp !== false) {
            $list[] = substr($pre_timestamp, $index_name_len);
        }
        $pre_timestamp = strstr($dir, self::network_base_name);
        if ($pre_timestamp !== false) {
            $tmp = substr($pre_timestamp, $network_name_len, -4);
            if (is_numeric($tmp)) {
                $list[] = $tmp;
            }
        }
    }
    return array_unique($list);
}
|
||||
}
|
237
src/controllers/GroupController.php
Normal file
237
src/controllers/GroupController.php
Normal file
|
@ -0,0 +1,237 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop as B;
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\WikiParser;
|
||||
|
||||
/**
|
||||
* Controller used to handle user group activities outside of
|
||||
* the admin panel setting. This either could be because the admin panel
|
||||
* is "collapsed" or because the request concerns a wiki page.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class GroupController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Says which activities (roughly methods invoke from the web) this
|
||||
* controller will respond to (note: more activities will be loaded from
|
||||
* components)
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["groupFeeds", "wiki"];
|
||||
/**
|
||||
* Associative array of $components activities for this controller
|
||||
* Components are collections of activities (a little like traits) which
|
||||
* can be reused.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
public static $component_activities = ["social" => ["groupFeeds", "wiki"]];
|
||||
/**
 * Handles a request for a user group activity made outside of the admin
 * panel (either because the admin panel is "collapsed" or because the
 * request concerns a wiki page).
 *
 * If the CSRF token does not check out, $_REQUEST is rebuilt so that it
 * only holds a fixed set of cleaned fields (and "arg" only when it is one
 * of the read-only values). The activity is then run via processSession(),
 * a view is chosen from the resulting data, and that view is displayed.
 */
public function processRequest()
{
    $data = [];
    if (!C\PROFILE) {
        return $this->configureRequest();
    }
    $logged_in = isset($_SESSION['USER_ID']);
    if ($logged_in) {
        $user_id = $_SESSION['USER_ID'];
        $data['ADMIN'] = 1;
    } else {
        // users without a session are identified by their address
        $user_id = $_SERVER['REMOTE_ADDR'];
    }
    $data['USERNAME'] = $this->model("signin")->getUserName($user_id);
    $data['SCRIPT'] = "";
    $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
    $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user_id);
    if (!$token_okay) {
        $keep_fields = ["a", "arg", "f", "callback", "group_id",
            "just_group_id", "just_user_id", "just_thread", "limit", "n",
            "num", "page_id", "page_name", 'v', "group_name", 'sf'];
        $allowed_args = ["read", "pages", "media", "statistics"];
        $original_request = $_REQUEST;
        $_REQUEST = [];
        foreach ($keep_fields as $field) {
            if (!isset($original_request[$field])) {
                continue;
            }
            $value = $original_request[$field];
            if ($field == "arg" && !in_array($value, $allowed_args)) {
                continue;
            }
            $_REQUEST[$field] = $this->clean($value, "string");
        }
        $_REQUEST["c"] = "group";
    }
    $data = array_merge($data, $this->processSession());
    $has_refresh = isset($data['REFRESH']);
    $view = ($has_refresh) ? $data['REFRESH'] : "group";
    if ($data['ACTIVITY_METHOD'] == "wiki") {
        if (isset($data["VIEW"]) && !$has_refresh) {
            $view = $data["VIEW"];
        }
    } else if (isset($_REQUEST['f']) &&
        in_array($_REQUEST['f'], ["rss", "json", "serial"])) {
        $this->setupViewFormatOutput($_REQUEST['f'], $view, $data);
    }
    $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
    $this->initializeAdFields($data);
    $this->displayView($view, $data);
}
|
||||
/**
 * Used to perform the actual activity call to be done by the
 * group_controller.
 * processSession is called from @see processRequest, which does some
 * cleaning of fields if the CSRFToken is not valid. It is more likely
 * that the group_controller may be involved in such requests as it can
 * be invoked either when a user is logged in or not and for users with and
 * without accounts. processSession makes sure the $_REQUEST'd activity is
 * valid (or falls back to groupFeeds) then calls it. The returned data is
 * tagged with ACTIVITY_CONTROLLER and ACTIVITY_METHOD so that, if someone
 * uses the Settings link to change the language or default number of feed
 * elements to view, the back/cancel button on that page works correctly.
 *
 * @return array data produced by the activity (an empty array if the
 * activity did not return one) with the ACTIVITY_CONTROLLER and
 * ACTIVITY_METHOD fields added
 */
public function processSession()
{
    if (isset($_REQUEST['a']) &&
        in_array($_REQUEST['a'], $this->activities, true)) {
        $activity = $_REQUEST['a'];
    } else {
        $activity = "groupFeeds";
    }
    $_SESSION['HIDE_ACTIVITIES'] = false;
    if (!empty($_SESSION['USER_ID'])) {
        $this->model("user")->setUserSession($_SESSION['USER_ID'],
            $_SESSION);
    }
    $data = $this->call($activity);
    /* normalize first, then tag: previously the tags were written before
       this check and were lost whenever the activity returned a scalar
     */
    if (!is_array($data)) {
        $data = [];
    }
    $data['ACTIVITY_CONTROLLER'] = "group";
    $data['ACTIVITY_METHOD'] = $activity; //for settings controller
    return $data;
}
|
||||
/**
 * Responsible for setting the view for a feed if something other
 * than HTML (for example, RSS or JSON) is desired. It also
 * sets up any particular $data fields needed for displaying that
 * view correctly.
 *
 * For "rss" only $view is changed. For "json" and "serial" the feed is
 * written directly to the output (as JSON, JSONP when a callback request
 * field is present, or a PHP-serialized array) and the script exits.
 *
 * @param string $format can be one of rss, json, or serial,
 * if different, default HTML GroupView used.
 * @param string& $view variable used to set the view in calling
 * method
 * @param array& $data used to send data to the view for drawing
 */
public function setupViewFormatOutput($format, &$view, &$data)
{
    $data["QUERY"] = "groups:feed";
    if (isset($data["JUST_GROUP_ID"])) {
        $data["QUERY"] = "groups:just_group_id:" . $data["JUST_GROUP_ID"];
    }
    if (isset($data["JUST_USER_ID"])) {
        $data["QUERY"] = "groups:just_user_id:" . $data["JUST_USER_ID"];
    }
    if (isset($data["JUST_THREAD"])) {
        $data["QUERY"] = "groups:just_thread:" . $data["JUST_THREAD"];
    }
    $data["its"] = 0;
    $has_pages = isset($data["PAGES"]) && is_array($data["PAGES"]);
    $num_pages = ($has_pages) ? count($data["PAGES"]) : 0;
    /* NOTE(review): this reads $data['admin'] (lowercase) while
       processRequest() in this class sets $data['ADMIN']; as written the
       token is empty unless some activity sets the lowercase key --
       confirm which key is intended.
     */
    $token = empty($data['admin']) ? "" :
        C\CSRF_TOKEN . "=" . $data[C\CSRF_TOKEN];
    for ($i = 0; $i < $num_pages; $i++) {
        $data["PAGES"][$i][self::URL] = htmlentities(B\feedsUrl(
            "thread", $data["PAGES"][$i]['PARENT_ID'],
            !empty($data['admin']), $data['CONTROLLER'])) . $token;
    }
    switch ($format) {
        case "rss":
            $view = "rss";
            break;
        case "json":
        case "serial":
            /* both formats send the same structure; it used to be built
               only for "json", so "serial" serialized an undefined
               variable
             */
            $out_data = [];
            $out_data["language"] = L\getLocaleTag();
            $out_data["link"] =
                C\NAME_SERVER . "?f=$format&q={$data['QUERY']}";
            $out_data["totalResults"] = $data['TOTAL_ROWS'];
            $out_data["startIndex"] = $data['LIMIT'];
            $out_data["itemsPerPage"] = $data['RESULTS_PER_PAGE'];
            $pages = ($has_pages) ? $data['PAGES'] : [];
            foreach ($pages as $page) {
                $item = [];
                $item["title"] = $page[self::TITLE];
                if (!isset($page[self::TYPE]) ||
                    $page[self::TYPE] != "link") {
                    $item["link"] = $page[self::URL];
                } else {
                    $item["link"] = strip_tags($page[self::TITLE]);
                }
                $item["description"] = strip_tags($page[self::DESCRIPTION]);
                if (isset($page[self::THUMB])
                    && $page[self::THUMB] != 'null') {
                    $item["thumb"] = $page[self::THUMB];
                }
                if (isset($page[self::TYPE])) {
                    $item["type"] = $page[self::TYPE];
                }
                $out_data['item'][] = $item;
            }
            if ($format == "serial") {
                e(serialize($out_data));
                exit();
            }
            $out = json_encode($out_data);
            //jsonp format
            if (isset($_REQUEST['callback'])) {
                $callback = $this->clean($_REQUEST['callback'], "string");
                $out = "// API callback\n$callback($out);";
                header("Content-Type: text/javascript; charset=UTF-8");
            } else {
                header("Content-Type: application/json");
            }
            e($out);
            exit();
    }
}
|
||||
}
|
114
src/controllers/JobsController.php
Normal file
114
src/controllers/JobsController.php
Normal file
|
@ -0,0 +1,114 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\MediaConstants;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
|
||||
/**
|
||||
* This class is used to handle requests from a MediaUpdater to a name server
|
||||
* There are three main types of requests: getUpdateProperties, and
|
||||
* for any job that the MediaUpdater might be running, its getTasks, and
|
||||
* putTasks request. getUpdateProperties is supposed to provide configuration
|
||||
* settings for the MediaUpdater. A MediaUpdater might be running several
|
||||
* periodic jobs. The getTasks requests of a job is used to see if there
|
||||
* is any new work available of that job type on the name server. A
|
||||
* putTasks request is used to handle any computed data sent back from a
|
||||
* MediaUpdater to the name server.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class JobsController extends Controller implements CrawlConstants,
|
||||
MediaConstants
|
||||
{
|
||||
/**
|
||||
* These are the activities supported by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["getUpdateProperties"];
|
||||
/**
 * Checks that the request seems to be coming from a legitimate
 * MediaUpdater, then determines which activity is being requested and
 * calls it.
 *
 * Activities listed in $this->activities are invoked on this controller.
 * A getTasks or putTasks request that also carries non-empty job and
 * machine_id fields is forwarded to an instance of the corresponding job
 * class, and the web-encoded serialization of the job's return value is
 * echoed. Any other request produces no output.
 */
public function processRequest()
{
    /* do a quick test to see if this request seems to come from a
       legitimate machine
     */
    if (!$this->checkRequest()) {
        return;
    }
    $activity = (isset($_REQUEST['a'])) ? $_REQUEST['a'] : false;
    /* strict comparisons: $activity is later used as a method name, so it
       must be exactly one of the allowed strings
     */
    if (in_array($activity, $this->activities, true)) {
        $this->call($activity);
        return;
    }
    if (empty($_REQUEST['job']) || empty($_REQUEST['machine_id']) ||
        !in_array($activity, ["getTasks", "putTasks"], true)) {
        return;
    }
    $job = $this->clean($_REQUEST['job'], "string");
    $machine_id = L\webdecode(
        $this->clean($_REQUEST['machine_id'], "string"));
    $args = null;
    if (isset($_REQUEST['args'])) {
        /* NOTE(review): unserialize() is applied to request data. This
           is only reached after checkRequest() has succeeded, but
           unserializing externally supplied data can instantiate
           arbitrary classes; consider restricting allowed classes or
           switching to a data-only format.
         */
        $args = unserialize(L\webdecode($_REQUEST['args']));
    }
    $class_name = C\NS_JOBS . lcfirst($job) . "Job";
    if (class_exists($class_name)) {
        $job_object = new $class_name(null, $this);
        $result = $job_object->$activity($machine_id, $args);
        echo L\webencode(serialize($result));
    }
}
|
||||
/**
 * Outputs the update properties a media updater should use.
 *
 * Reads the profile stored for WORK_DIRECTORY and echoes a web-encoded,
 * serialized array with two fields: MEDIA_MODE (taken from the profile,
 * "name_server" if the profile does not set it) and
 * SEND_MAIL_MEDIA_UPDATER (taken from the profile, false if not set).
 */
public function getUpdateProperties()
{
    $profile = $this->model("profile")->getProfile(C\WORK_DIRECTORY);
    // defaults used for any field the profile does not provide
    $response = [
        'MEDIA_MODE' => "name_server",
        'SEND_MAIL_MEDIA_UPDATER' => false,
    ];
    foreach (array_keys($response) as $field) {
        if (isset($profile[$field])) {
            $response[$field] = $profile[$field];
        }
    }
    echo L\webencode(serialize($response));
}
|
||||
}
|
221
src/controllers/MachineController.php
Normal file
221
src/controllers/MachineController.php
Normal file
|
@ -0,0 +1,221 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\CrawlDaemon;
|
||||
|
||||
/**
|
||||
* This class handles requests from a computer that is managing several
|
||||
* fetchers and queue_servers. This controller might be used to start, stop
|
||||
* fetchers/queue_server as well as get status on the active fetchers
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class MachineController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* These are the activities supported by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["statuses", "update", "log"];
|
||||
/**
|
||||
* Number of characters from end of most recent log file to return
|
||||
* on a log request
|
||||
*/
|
||||
const LOG_LISTING_LEN = 200000;
|
||||
/**
 * Checks that the request seems to be coming from a legitimate machine,
 * then determines which activity is being requested and calls that
 * activity for processing.
 *
 * Requests that fail checkRequest(), that carry no activity field, or
 * that name an activity not listed in $this->activities are ignored.
 */
public function processRequest()
{
    /* do a quick test to see if this request seems to come from a
       legitimate machine
     */
    if (!$this->checkRequest()) {
        return;
    }
    // a missing activity is treated the same as an unknown one
    $activity = (isset($_REQUEST['a'])) ? $_REQUEST['a'] : false;
    if (in_array($activity, $this->activities, true)) {
        $this->call($activity);
    }
}
|
||||
/**
 * Outputs, as JSON, the statuses reported by CrawlDaemon::statuses() for
 * the current Yioop instance.
 *
 * If the request has an arg field, its cleaned value is first written to
 * WORK_DIRECTORY/schedules/current_machine_info.txt.
 */
public function statuses()
{
    if (isset($_REQUEST["arg"])) {
        // the next file tells the MediaUpdater what machine it is
        $machine_info_file = C\WORK_DIRECTORY .
            "/schedules/current_machine_info.txt";
        file_put_contents($machine_info_file,
            $this->clean($_REQUEST["arg"], "string"));
    }
    header("Content-Type: application/json");
    echo json_encode(CrawlDaemon::statuses());
}
|
||||
/**
 * Used to start/stop a QueueServer, Mirror, MediaUpdater, or Fetcher of
 * the current Yioop instance based on the type, id, and action fields of
 * the current $_REQUEST. A type of RestartFetcher additionally appends
 * the last lines of the fetcher's log to the crash log before starting
 * the fetcher again.
 *
 * Nothing is done unless type, id, and action are all present. A start
 * is only attempted when CrawlDaemon::statuses() has no entry for the
 * process, and a stop only when it has one.
 */
public function update()
{
    if (!isset($_REQUEST['type']) || !isset($_REQUEST['id']) ||
        !isset($_REQUEST['action'])) {
        return;
    }
    $action = $_REQUEST['action'];
    $statuses = CrawlDaemon::statuses();
    switch ($_REQUEST['type']) {
        case 'QueueServer':
            if ($action == "start" &&
                !isset($statuses["QueueServer"][-1])) {
                CrawlDaemon::start("QueueServer", 'none',
                    self::INDEXER, 0);
                CrawlDaemon::start("QueueServer", 'none',
                    self::SCHEDULER, 2);
            } else if ($action == "stop" &&
                isset($statuses["QueueServer"][-1])) {
                CrawlDaemon::stop("QueueServer");
            }
            break;
        case 'Mirror':
            if ($action == "start" &&
                !isset($statuses["Mirror"][-1])) {
                $parent = (isset($_REQUEST['parent'])) ?
                    $this->clean($_REQUEST['parent'], 'string') : "";
                if ($parent) {
                    file_put_contents(C\CRAWL_DIR .
                        "/schedules/mirror_parent.txt",
                        L\webdecode($parent));
                }
                CrawlDaemon::start("Mirror");
            } else if ($action == "stop" &&
                isset($statuses["Mirror"][-1])) {
                /* the stop test previously read $_REQUEST['Mirror'],
                   a field that is never required to be present, so a
                   mirror could not be stopped through this activity
                 */
                CrawlDaemon::stop("Mirror");
            }
            break;
        case 'MediaUpdater':
            if ($action == "start" &&
                !isset($statuses["MediaUpdater"][-1])) {
                CrawlDaemon::start("MediaUpdater");
            } else if ($action == "stop" &&
                isset($statuses["MediaUpdater"][-1])) {
                CrawlDaemon::stop("MediaUpdater");
            }
            break;
        case 'Fetcher':
            // cleaned as an int, the same way log() treats fetcher ids
            $id = $this->clean($_REQUEST['id'], "int");
            if ($action == "start" &&
                !isset($statuses["Fetcher"][$id])) {
                CrawlDaemon::start("Fetcher", "$id");
            } else if ($action == "stop" &&
                isset($statuses["Fetcher"][$id])) {
                CrawlDaemon::stop("Fetcher", "$id");
            }
            break;
        case 'RestartFetcher':
            $error_log = C\CRASH_LOG_NAME;
            /* the id becomes part of a log file path below, so it is
               cleaned as an int before use
             */
            $id = $this->clean($_REQUEST['id'], "int");
            $msg = "Restarting Fetcher $id";
            $time_string = date("r", time());
            $out_msg = "[$time_string] $msg\n";
            $lines = L\tail(C\LOG_DIR . "/$id-Fetcher.log", 10);
            foreach ($lines as $line) {
                $out_msg .= "!!!!$line\n";
            }
            // start the crash log afresh if it is missing or too large
            if (!file_exists($error_log) || filesize($error_log) >
                C\MAX_LOG_FILE_SIZE) {
                file_put_contents($error_log, $out_msg);
            } else {
                file_put_contents($error_log, $out_msg,
                    FILE_APPEND);
            }
            CrawlDaemon::start("Fetcher", "$id");
            break;
    }
}
|
||||
/**
 * Used to retrieve the tail of a Fetcher, MediaUpdater, Mirror, or
 * QueueServer log file for the current Yioop instance.
 *
 * At most LOG_LISTING_LEN bytes from the end of the log are read. If the
 * request has a non-empty f field, only lines containing that string
 * (case-insensitively) are kept. The result is web-encoded and output as
 * a JSON string. An unknown type, a missing log, or an empty log yields
 * an empty listing.
 */
public function log()
{
    $log_data = "";
    if (!isset($_REQUEST["type"])) {
        echo json_encode(urlencode($log_data));
        return;
    }
    // stays false for an unrecognized type, so no file is looked up
    $log_file_name = false;
    switch ($_REQUEST["type"]) {
        case "Fetcher":
            $fetcher_num = (isset($_REQUEST["id"])) ?
                $this->clean($_REQUEST["id"], "int") : 0;
            $log_file_name = C\LOG_DIR . "/{$fetcher_num}-Fetcher.log";
            break;
        case "MediaUpdater":
        case "Mirror":
        case "QueueServer":
            $log_file_name = C\LOG_DIR . "/" . $_REQUEST["type"] . ".log";
            break;
    }
    $filter = "";
    if (isset($_REQUEST["f"])) {
        $filter = $this->clean($_REQUEST["f"], "string");
    }
    if ($log_file_name !== false && file_exists($log_file_name)) {
        $size = filesize($log_file_name);
        $len = min(self::LOG_LISTING_LEN, $size);
        // fread() rejects a length of 0, so skip reading an empty log
        if ($len > 0) {
            $fh = fopen($log_file_name, "r");
            if ($fh) {
                fseek($fh, $size - $len);
                $log_data = fread($fh, $len);
                fclose($fh);
            }
        }
        if ($filter != "" && strlen($log_data) > 0) {
            $log_lines = explode("\n", $log_data);
            $out_lines = [];
            foreach ($log_lines as $line) {
                if (stristr($line, $filter)) {
                    $out_lines[] = $line;
                }
            }
            if (count($out_lines) == 0) {
                $out_lines[] = tl('machine_controller_nolines');
            }
            $log_data = implode("\n", $out_lines);
        }
    }
    header("Content-Type: application/json");
    echo json_encode(L\webencode($log_data));
}
|
||||
}
|
1426
src/controllers/RegisterController.php
Normal file
1426
src/controllers/RegisterController.php
Normal file
File diff suppressed because it is too large
Load diff
384
src/controllers/ResourceController.php
Normal file
384
src/controllers/ResourceController.php
Normal file
|
@ -0,0 +1,384 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\FetchUrl;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
use seekquarry\yioop\library\MediaConstants;
|
||||
use seekquarry\yioop\library\processors\ImageProcessor;
|
||||
|
||||
/**
|
||||
* Used to serve resources, css, or scripts such as images from APP_DIR
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class ResourceController extends Controller implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* These are the activities supported by this controller
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["get", "syncList", "syncNotify", "suggest"];
|
||||
/**
 * Determines which activity is being requested and calls the method for
 * that activity.
 *
 * The get and suggest activities are served without calling
 * checkRequest(); every other activity requires checkRequest() to
 * succeed. A request with a missing or unsupported activity, or one that
 * fails the check, is answered with requestError().
 */
public function processRequest()
{
    // a missing activity is handled like an unknown one
    $activity = (isset($_REQUEST['a'])) ? $_REQUEST['a'] : false;
    $is_open_activity = in_array($activity, ["get", "suggest"], true);
    if (($is_open_activity || $this->checkRequest()) &&
        in_array($activity, $this->activities, true)) {
        $this->call($activity);
        return;
    }
    $this->requestError();
}
|
||||
/**
 * Gets the resource $_REQUEST['n'] from the folder selected by
 * $_REQUEST['f'] after cleaning, and outputs it.
 *
 * For css, scripts, and resources the folder is computed by
 * getBaseFolder(), which returns false when access is refused. For cache
 * the folder is CRAWL_DIR/cache and the request must pass
 * checkRequest(). Any other folder value produces no output.
 *
 * If both o and l are given, only l bytes starting at offset o are
 * output. A request with an HTTP Range header is delegated to
 * serveRangeRequest(). A feed thumbnail (t=feed) that does not exist yet
 * is built from the image URL stored in the accompanying .txt file.
 */
public function get()
{
    if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
        return;
    }
    $name = $this->clean($_REQUEST['n'], "file_name");
    if (in_array($_REQUEST['f'], ["css", "scripts", "resources"], true)) {
        /* notice in this case we didn't check if request come from a
           legitimate source but we do try to restrict it to being
           a file (not a folder) in the above array. If the request
           is for a file in resources, then if it is for a private
           group, we will check in getBaseFolder if the request is legit
         */
        $base_dir = $this->getBaseFolder();
        if (!$base_dir) {
            /* requestError() only sets a redirect header, so we must
               stop here; otherwise the lookup would continue with an
               empty base folder
             */
            $this->requestError();
            return;
        }
        $type = UrlParser::getDocumentType($name);
        if (!empty($_REQUEST['t']) && $_REQUEST['t'] == 'feed') {
            $type = "";
        }
        $name = UrlParser::getDocumentFilename($name);
        $name = ($type != "") ? "$name.$type" : $name;
        if (!empty($_REQUEST['t'])) {
            $name .= ".jpg";
        }
        $sub_path = "";
        if (!empty($_REQUEST['sf'])) {
            $sub_path = $this->clean($_REQUEST['sf'], "string");
            // dots are removed so the sub-path cannot contain ".."
            $sub_path = str_replace(".", "", $sub_path) . "/";
            if ($sub_path == "/") {
                $sub_path = "";
            }
        }
        $name = $sub_path . $name;
    } else if ($_REQUEST['f'] === "cache") {
        /* perform check since these request should come from a known
           machine; as above we must stop if the check fails, otherwise
           the cache file would still be output
         */
        if (!$this->checkRequest()) {
            $this->requestError();
            return;
        }
        $base_dir = C\CRAWL_DIR . "/cache";
    } else {
        return;
    }
    if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
        $offset = $this->clean($_REQUEST['o'], "int");
        $limit = $this->clean($_REQUEST['l'], "int");
    }
    $path = "$base_dir/$name";
    if (isset($_REQUEST['t']) && $_REQUEST['t'] == 'feed' &&
        !file_exists($path) && file_exists("$path.txt")) {
        $image_url = file_get_contents("$path.txt");
        if (!empty($image_url)) {
            $image_page = FetchUrl::getPage($image_url);
            restore_error_handler();
            $image = @imagecreatefromstring($image_page);
            set_error_handler(C\NS_LIB . "yioop_error_handler");
            // only try to make a thumb if the download decoded as an image
            $thumb = ($image === false) ? false :
                ImageProcessor::createThumb($image);
            if (!empty($thumb)) {
                file_put_contents($path, $thumb);
            }
        }
    }
    if (file_exists($path)) {
        $path = realpath($path);
        $mime_type = L\mimeType($path);
        $size = filesize($path);
        $start = 0;
        $end = $size - 1;
        header("Content-type: $mime_type");
        header('Content-Disposition: filename="' .$name. '"');
        header("Accept-Ranges: bytes");
        if (isset($_SERVER['HTTP_RANGE'])) {
            $this->serveRangeRequest($path, $size, $start, $end);
            return;
        }
        header("Content-Length: ".$size);
        header("Content-Range: bytes $start-$end/$size");
        if (isset($offset) && isset($limit)) {
            echo file_get_contents($path, false, null, $offset, $limit);
        } else {
            readfile($path);
        }
    } else {
        $this->requestError();
    }
}
|
||||
/**
 * Handles requests that result in errors to this controller by sending
 * a Location header pointing at BASE_URL/error.php. This method only
 * sets the header; it does not stop the request.
 */
public function requestError()
{
    $error_url = C\BASE_URL . "/error.php";
    header("Location:" . $error_url);
}
|
||||
/**
 * Computes based on the request the folder that should be used to
 * find a file during a resource get request. It also checks if user
 * has access to the requested folder.
 *
 * With an s field (and no g field) on the resources folder, the cleaned
 * s value names a sub-folder, stored beneath a folder named by its first
 * three characters. With a g field, the sub-folder is derived from a
 * hash of the group id, page id, and AUTH_KEY; for non-public groups the
 * CSRF token must be valid. A redirect.txt file in the group's folder
 * can point the lookup at a different directory.
 *
 * @return mixed either a string with the folder name in it or false if
 *      the user does not have access or that folder does not exist.
 */
public function getBaseFolder()
{
    $folder = $this->clean($_REQUEST['f'], 'string');
    $base_dir = C\APP_DIR . "/$folder";
    $add_to_path = false;
    $is_group_item = false;
    $page_id = "";
    $prefix_word = "";
    if (isset($_REQUEST['s']) && !isset($_REQUEST['g']) &&
        $folder == "resources") {
        // handle sub-folders of resource (name is cleaned as a hash)
        $subfolder = $this->clean($_REQUEST['s'], "hash");
        $prefix_folder = substr($subfolder, 0, 3);
        $add_to_path = true;
    } else if (isset($_REQUEST['g'])) {
        $user_id = isset($_SESSION['USER_ID']) ? $_SESSION['USER_ID'] :
            C\PUBLIC_USER_ID;
        if (isset($_REQUEST['p'])) {
            $page_id = $this->clean($_REQUEST['p'], 'string');
        }
        $group_id = $this->clean($_REQUEST['g'], "int");
        $group_model = $this->model('group');
        $token_okay = true;
        $pre_token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
        if ($group_id == C\PUBLIC_GROUP_ID) {
            $user_id = C\PUBLIC_USER_ID;
        } else {
            $token_okay = $pre_token_okay;
            // the user agent header is optional, so default it to empty
            $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ?
                $_SERVER['HTTP_USER_AGENT'] : "";
            if (empty($_COOKIE) &&
                stristr($user_agent, "Mobile") !== false &&
                stristr($user_agent, "Safari") !== false) {
                header('HTTP/1.0 403 Forbidden');
                //fixes mobile safari no send cookie bug
                exit();
            }
        }
        $group = $group_model->getGroupById($group_id, $user_id);
        if (!$group || !$token_okay) {
            return false;
        }
        // thumbnail requests (t field set) use a "t"-prefixed folder
        $prefix_word = (isset($_REQUEST['t'])) ? 't' : '';
        $base_subfolder = L\crawlHash(
            'group' . $group_id . $page_id . C\AUTH_KEY);
        $prefix_folder = substr($base_subfolder, 0, 3);
        $subfolder = $prefix_word . $base_subfolder;
        $add_to_path = true;
        $is_group_item = true;
    }
    if ($add_to_path) {
        if ($is_group_item) {
            $redirect_dir = "$base_dir/$prefix_folder/$base_subfolder";
        }
        if ($is_group_item &&
            file_exists($redirect_dir . "/redirect.txt")) {
            $tmp_path = file_get_contents($redirect_dir . "/redirect.txt");
            if (is_dir($tmp_path)) {
                if ($subfolder == $base_subfolder) {
                    // not a thumbnail request: serve from redirect target
                    $base_dir = $tmp_path;
                } else {
                    /* thumbnail request: thumbs are looked up under
                       APP_DIR in a folder named from a hash of the
                       redirect target
                     */
                    $subfolder = L\crawlHash($tmp_path);
                    $prefix_folder = substr($subfolder, 0, 3);
                    $subfolder = $prefix_word . $subfolder;
                    $base_dir .= "/$prefix_folder/$subfolder";
                }
            }
        } else {
            $base_dir .= "/$prefix_folder/$subfolder";
        }
    }
    return $base_dir;
}
|
||||
/**
 * Code to handle HTTP range requests of resources. This allows
 * HTTP pseudo-streaming of video. This code was inspired by:
 * http://www.tuxxin.com/php-mp4-streaming/
 *
 * Only a single byte range is supported. Accepted forms of the Range
 * header value are "first-last", "first-" (to the end of the file) and
 * "-N" (the last N bytes). A request for several ranges, a malformed
 * range, or a range starting beyond the end of the file is answered
 * with a 416 status. Otherwise a 206 response with the requested bytes
 * is output.
 *
 * @param string $file Name of file to serve range request for
 * @param int $size size of the file in bytes
 * @param int $start starting byte location want to serve
 * @param int $end ending byte location want to serve
 */
public function serveRangeRequest($file, $size, $start, $end)
{
    $range_header = (isset($_SERVER['HTTP_RANGE'])) ?
        $_SERVER['HTTP_RANGE'] : "";
    $header_parts = explode('=', $range_header, 2);
    $range = (isset($header_parts[1])) ? trim($header_parts[1]) : "";
    $current_start = $start;
    $current_end = $end;
    // a missing range or a list of several ranges is not supported
    $satisfiable = ($range !== "" && strpos($range, ',') === false);
    if ($satisfiable) {
        $bounds = explode('-', $range, 2);
        $first = trim($bounds[0]);
        $last = (isset($bounds[1])) ? trim($bounds[1]) : "";
        if ($first === "" && $last === "") {
            // a bare "-" is kept as a request for the final byte
            $current_start = $size - 1;
        } else if ($first === "") {
            // suffix range "-N": the last N bytes of the file
            if (ctype_digit($last) && (int)$last > 0) {
                $current_start = max(0, $size - (int)$last);
            } else {
                $satisfiable = false;
            }
        } else if (ctype_digit($first)) {
            $current_start = (int)$first;
            // an absent or non-numeric last byte means "to end of file"
            if (ctype_digit($last)) {
                $current_end = min((int)$last, $end);
            }
        } else {
            $satisfiable = false;
        }
    }
    if (!$satisfiable || $current_start > $current_end ||
        $current_start > $size - 1) {
        header('HTTP/1.1 416 Requested Range Not Satisfiable');
        header("Content-Range: bytes */$size");
        return;
    }
    $start = $current_start;
    $end = $current_end;
    $length = $end - $start + 1;
    $fp = @fopen($file, 'rb');
    if ($fp === false) {
        header('HTTP/1.1 500 Internal Server Error');
        return;
    }
    fseek($fp, $start);
    header('HTTP/1.1 206 Partial Content');
    header("Content-Range: bytes $start-$end/$size");
    header("Content-Length: " . $length);
    $buffer = 8192;
    $remaining = $length;
    /* stream in chunks, never asking fread() for 0 bytes and stopping
       once exactly $length bytes were sent or the client disconnected
     */
    while ($remaining > 0 && !feof($fp) && connection_status() == 0) {
        $chunk = fread($fp, min($buffer, $remaining));
        if ($chunk === false || $chunk === "") {
            break;
        }
        echo $chunk;
        flush();
        $remaining -= strlen($chunk);
    }
    fclose($fp);
}
|
||||
/**
 * Used to get a keyword suggest trie. This sends additional
 * headers so it will be decompressed on the fly.
 *
 * The locale request field must be a two-letter tag, optionally followed
 * by a hyphen and two more letters (for example en or en-US). Nothing is
 * output if the field is missing, does not have this form, or no trie
 * file exists for the locale.
 */
public function suggest()
{
    if (!isset($_REQUEST["locale"]) || !is_string($_REQUEST["locale"])) {
        return;
    }
    $locale = $_REQUEST["locale"];
    /* letters only: the earlier class [a-zA-z] also matched the
       punctuation characters that lie between "Z" and "a" in ASCII
     */
    if (preg_match("/^[a-zA-Z]{2}(-[a-zA-Z]{2})?$/", $locale) != 1) {
        return;
    }
    $locale = str_replace("-", "_", $locale);
    $path = C\LOCALE_DIR . "/$locale/resources/suggest_trie.txt.gz";
    if (file_exists($path)) {
        header("Content-Type: application/json");
        header("Content-Encoding: gzip");
        header("Content-Length: " . filesize($path));
        readfile($path);
    }
}
|
||||
/**
 * Used to notify a machine that another machine acting as a mirror
 * is still alive. Data is stored in a txt file self::mirror_table_name
 *
 * Nothing is recorded unless the request has a positive last_sync field
 * and a robot_instance field. The entry saved for the robot instance
 * holds the remote address, the machine_uri field, the current time, and
 * the cleaned last_sync value. A stored table whose time stamp is older
 * than MIRROR_SYNC_FREQUENCY seconds is discarded before the new entry
 * is added.
 */
public function syncNotify()
{
    if (!isset($_REQUEST['last_sync']) || !($_REQUEST['last_sync'] > 0)) {
        return;
    }
    $mirror_table_name = C\CRAWL_DIR . "/" . self::mirror_table_name;
    $mirror_table = [];
    $time = time();
    if (file_exists($mirror_table_name)) {
        $mirror_table = unserialize(
            file_get_contents($mirror_table_name));
        if (!is_array($mirror_table)) {
            // unreadable or corrupt table file: start over
            $mirror_table = [];
        }
        /* truncate table periodically to get rid of stale entries.
           The age is now minus the stored time; the reversed
           subtraction used before could never exceed the frequency
         */
        if (isset($mirror_table['time']) &&
            $time - $mirror_table['time'] > C\MIRROR_SYNC_FREQUENCY) {
            $mirror_table = [];
        }
    }
    if (isset($_REQUEST['robot_instance'])) {
        $machine_uri = (isset($_REQUEST['machine_uri'])) ?
            $_REQUEST['machine_uri'] : "";
        $mirror_table['time'] = $time;
        $mirror_table['machines'][
            $this->clean($_REQUEST['robot_instance'], "string")] =
            [$_SERVER['REMOTE_ADDR'], $machine_uri,
            $time,
            $this->clean($_REQUEST['last_sync'], "int")];
        file_put_contents($mirror_table_name, serialize($mirror_table));
    }
}
|
||||
/**
 * Outputs a list of syncable files under CRAWL_DIR/cache together with
 * their info as returned by the crawl model's getDeltaFileInfo(), using
 * the last_sync request field (0 if absent) as the reference time.
 *
 * syncNotify() is called first. The output is a base64-encoded,
 * gz-compressed, serialized array whose STATUS is CONTINUE_STATE with the
 * files in DATA, or NO_DATA_STATE when there are no files to report.
 */
public function syncList()
{
    $this->syncNotify();
    $last_sync = (isset($_REQUEST["last_sync"])) ?
        $this->clean($_REQUEST["last_sync"], "int") : 0;
    // substrings to exclude from our list
    $excludes = [".DS", "__MACOSX", "queries", "QueueBundle", "tmp",
        "thumb"];
    $sync_files = $this->model("crawl")->getDeltaFileInfo(
        C\CRAWL_DIR."/cache", $last_sync, $excludes);
    if (count($sync_files) > 0) {
        $info = [
            self::STATUS => self::CONTINUE_STATE,
            self::DATA => $sync_files,
        ];
    } else {
        $info = [self::STATUS => self::NO_DATA_STATE];
    }
    echo base64_encode(gzcompress(serialize($info)));
}
|
||||
}
|
2078
src/controllers/SearchController.php
Normal file
2078
src/controllers/SearchController.php
Normal file
File diff suppressed because it is too large
Load diff
184
src/controllers/SettingsController.php
Normal file
184
src/controllers/SettingsController.php
Normal file
|
@ -0,0 +1,184 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop as B;
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
|
||||
/**
|
||||
* Controller used to display and update a user's search settings:
|
||||
* the interface language, the number of results per page, whether
|
||||
* results open in new tabs, and which index or crawl mix to search.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class SettingsController extends Controller
|
||||
{
|
||||
/**
 * Sets up the available per-page and language options and displays the
 * settings view.
 *
 * If handling data sent from a form, it stores validated versions of
 * the language, the number of results per page, the open-in-tabs choice
 * and (via loggedInChangeSettings) the index to use into the session.
 * When any setting changed, the session is saved through the user model
 * and the request is redirected with a "settings saved" message.
 *
 * A CSRF token is only checked when the session has a USER_ID and the
 * request carries a token; if that check fails the USER_ID is removed
 * from the session and no settings are changed.
 */
public function processRequest()
{
    $data = [];
    $view = "settings";
    $changed_settings_flag = false;
    $crawl_model = $this->model("crawl");
    if (isset($_SESSION['USER_ID']) && isset($_REQUEST[C\CSRF_TOKEN])) {
        $user = $_SESSION['USER_ID'];
        $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user);
        $data['ADMIN'] = 1;
    } else {
        $user = $_SERVER['REMOTE_ADDR'];
        $token_okay = true;
    }
    if (!$token_okay) {
        $user = $_SERVER['REMOTE_ADDR'];
        unset($_SESSION['USER_ID']);
    }
    $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
    $languages = $this->model("locale")->getLocaleList();
    // initialized so the lookups below work with an empty locale list
    $data['LANGUAGES'] = [];
    foreach ($languages as $language) {
        $data['LANGUAGES'][$language['LOCALE_TAG']] =
            $language['LOCALE_NAME'];
    }
    if ($token_okay && isset($_REQUEST['lang']) &&
        is_string($_REQUEST['lang']) &&
        array_key_exists($_REQUEST['lang'], $data['LANGUAGES'])) {
        $_SESSION['l'] = $_REQUEST['lang'];
        L\setLocaleObject($_SESSION['l']);
        $changed_settings_flag = true;
    }
    $data['LOCALE_TAG'] = L\getLocaleTag();
    $n = C\NUM_RESULTS_PER_PAGE;
    $data['PER_PAGE'] =
        [$n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n];
    /* only an all-digit value naming one of the offered sizes is
       accepted; a loose comparison would also let values such as
       "10abc" through on older PHP versions
     */
    if ($token_okay && isset($_REQUEST['perpage']) &&
        is_string($_REQUEST['perpage']) &&
        ctype_digit($_REQUEST['perpage']) &&
        isset($data['PER_PAGE'][(int)$_REQUEST['perpage']])) {
        $_SESSION['MAX_PAGES_TO_SHOW'] =
            (string)((int)$_REQUEST['perpage']);
        $changed_settings_flag = true;
    }
    if (isset($_SESSION['MAX_PAGES_TO_SHOW'])) {
        $data['PER_PAGE_SELECTED'] = $_SESSION['MAX_PAGES_TO_SHOW'];
    } else {
        $data['PER_PAGE_SELECTED'] = C\NUM_RESULTS_PER_PAGE;
    }
    /* an unchecked checkbox is not sent at all, so presence of perpage
       is used to detect that the settings form was submitted
     */
    if ($token_okay && isset($_REQUEST['perpage'])) {
        $_SESSION['OPEN_IN_TABS'] = (isset($_REQUEST['open_in_tabs'])) ?
            true : false;
    }
    if (isset($_SESSION['OPEN_IN_TABS'])) {
        $data['OPEN_IN_TABS'] = $_SESSION['OPEN_IN_TABS'];
    } else {
        $data['OPEN_IN_TABS'] = false;
    }
    $machine_urls = $this->model("machine")->getQueueServerUrls();
    $crawls = $crawl_model->getCrawlList(false, true, $machine_urls,
        true);
    $data['CRAWLS'] = [];
    foreach ($crawls as $crawl) {
        $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'].
            " ... ".$crawl['COUNT']." urls";
    }
    $mixes = $crawl_model->getMixList($user);
    if (!empty($mixes)) {
        foreach ($mixes as $mix) {
            $data['CRAWLS'][$mix['TIMESTAMP']] = $mix['NAME'].
                " ... ".tl('settings_controller_crawl_mix');
        }
    }
    $crawl_stamps = array_keys($data['CRAWLS']);
    if ($token_okay) {
        /* keep a change already recorded above (language or per page)
           even when the index setting itself was not changed
         */
        $changed_settings_flag = $this->loggedInChangeSettings($data) ||
            $changed_settings_flag;
    } else if (isset($_REQUEST['its']) &&
        in_array($_REQUEST['its'], $crawl_stamps)) {
        $data['its'] = $_REQUEST['its'];
    } else {
        $data['its'] = $crawl_model->getCurrentIndexDatabaseName();
    }
    if ($changed_settings_flag) {
        $this->model("user")->setUserSession($user, $_SESSION);
        return $this->redirectWithMessage(
            tl('settings_controller_settings_saved'),
            ['return', 'oldc']);
    }
    $this->displayView($view, $data);
}
|
||||
/**
|
||||
* Changes settings for a logged in user, this might involve storing
|
||||
* data into the active session.
|
||||
*
|
||||
* @param array& $data fields which might be sent to the view
|
||||
* @return bool if any settings were changed
|
||||
*/
|
||||
public function loggedInChangeSettings(&$data)
{
    $index_source = $this->model("crawl");
    /* array_keys turns numeric timestamp keys into ints, so the membership
       tests below intentionally use in_array's loose comparison */
    $known_stamps = array_keys($data['CRAWLS']);
    $settings_changed = false;
    $request_has_valid_index = isset($_REQUEST['index_ts']) &&
        in_array($_REQUEST['index_ts'], $known_stamps);
    if ($request_has_valid_index) {
        // a new index was picked in the form: remember it in the session
        $data['its'] = $_SESSION['its'] = $_REQUEST['index_ts'];
        $settings_changed = true;
    } else if (isset($_SESSION['its']) &&
        in_array($_SESSION['its'], $known_stamps)) {
        $data['its'] = $_SESSION['its'];
    } else {
        $data['its'] = $index_source->getCurrentIndexDatabaseName();
    }
    if (!isset($_REQUEST['return'])) {
        // no return link was requested, so nothing more to set up
        return $settings_changed;
    }
    $controller_name = "admin";
    if (isset($_REQUEST['oldc'])) {
        $controller_name = $this->clean($_REQUEST['oldc'], "string");
        $data['oldc'] = $controller_name;
    }
    $return_to = $this->clean($_REQUEST['return'], 'string');
    $data['return'] = $return_to;
    // separator to use if a CSRF token has to be appended to the url
    $separator = "?";
    if (C\REDIRECTS_ON && $controller_name == 'search' &&
        $return_to == 'more') {
        $data['RETURN'] = B\moreUrl();
    } else if (substr($return_to, 0, 2) == 's/') {
        $data['RETURN'] = B\subsearchUrl(substr($return_to, 2));
    } else {
        $data['RETURN'] = B\controllerUrl($controller_name, true) .
            "a=" . $return_to;
        $separator = '&';
    }
    if (!empty($data['ADMIN'])) {
        $data['RETURN'] .= $separator . C\CSRF_TOKEN . "=" .
            $data[C\CSRF_TOKEN];
    }
    return $settings_changed;
}
|
||||
}
|
237
src/controllers/StaticController.php
Normal file
237
src/controllers/StaticController.php
Normal file
|
@ -0,0 +1,237 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
|
||||
/**
|
||||
* This controller is used by the Yioop web site to display
|
||||
* PUBLIC_GROUP_ID pages more like static forward facing pages.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class StaticController extends Controller
|
||||
{
|
||||
/**
|
||||
* Says which activities (roughly methods invoked from the web)
|
||||
* this controller will respond to
|
||||
* @var array
|
||||
*/
|
||||
public $activities = ["showPage", "signout"];
|
||||
/**
|
||||
* This is the main entry point for handling people arriving to view
|
||||
* a static page. It determines which page to draw and calls the view
|
||||
* to draw it.
|
||||
*/
|
||||
public function processRequest()
{
    $data = [];
    $view = "static";
    /* Only activities listed in $this->activities are honoured; a missing
       or unknown activity falls back to showing a page */
    $activity = "showPage";
    if (isset($_REQUEST['a']) &&
        in_array($_REQUEST['a'], $this->activities, true)) {
        $activity = $_REQUEST['a'];
        if ($activity == "signout") {
            /* signout has no activity method of its own: queue the
               logout message and then draw the requested page */
            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
                tl('static_controller_logout_successful')."</h1>')";
            $activity = "showPage";
        }
    }
    $data['VIEW'] = $view;
    $activity_data = $this->call($activity);
    if (is_array($activity_data)) {
        $data = array_merge($data, $activity_data);
    }
    /* The token is tied to the signed-in user if there is one, otherwise
       to the address the request came from */
    $signed_in = isset($_SESSION['USER_ID']);
    $user = ($signed_in) ? $_SESSION['USER_ID'] : $_SERVER['REMOTE_ADDR'];
    $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
    if ($signed_in) {
        $data['ADMIN'] = 1;
    }
    $this->initializeAdFields($data);
    $this->displayView($view, $data);
}
|
||||
/**
|
||||
* This activity is used to display one of the PUBLIC_GROUP_ID pages used
|
||||
* by the Yioop Web Site
|
||||
*
|
||||
* @return array $data has title and page contents of the static page to
|
||||
* display
|
||||
*/
|
||||
public function showPage()
{
    $data = [];
    if (isset($_REQUEST['p'])) {
        $page = $this->clean($_REQUEST['p'], "string");
        // drop ".." and "/" so the page name cannot contain path pieces
        $page = preg_replace("@(\.\.|\/)@", "", $page);
    } else {
        $page = "404";
    }
    $page_string = $this->getPage($page);
    if ($page_string == "") {
        $page = "404";
        $page_string = $this->getPage($page);
    }
    // a backtick in the page source means the math script is needed
    if (strpos((string)$page_string, "`") !== false) {
        if (!isset($data["INCLUDE_SCRIPTS"])) {
            $data["INCLUDE_SCRIPTS"] = [];
        }
        $data["INCLUDE_SCRIPTS"][] = "math";
    }
    $data['page'] = $page;
    $static_view = $this->view("static");
    $this->parsePageHeadVarsView($static_view, $page, $page_string);
    if (isset($_SESSION['value'])) {
        $data['value'] = $this->clean($_SESSION['value'], "string");
    }
    /* head variables found for this page; an empty array is used when
       nothing was recorded for it */
    $head_info = (isset($static_view->head_objects[$page]) &&
        is_array($static_view->head_objects[$page])) ?
        $static_view->head_objects[$page] : [];
    $page_type = (isset($head_info['page_type'])) ?
        $head_info['page_type'] : "";
    if ($page_type == 'page_alias' && isset($head_info['page_alias']) &&
        $head_info['page_alias'] != '') {
        // an alias page just redirects to the page it names
        $_REQUEST['p'] = $head_info['page_alias'];
        return $this->redirectWithMessage("", ['p']);
    }
    if (isset($head_info['title'])) {
        $data["subtitle"] = ($head_info['title']) ?
            " - " . $head_info['title'] : "";
        $static_view->head_objects[$page]['title'] =
            tl('static_controller_complete_title', $head_info['title']);
    } else {
        $data["subtitle"] = "";
    }
    $locale_tag = L\getLocaleTag();
    $data['CONTROLLER'] = "static";
    $group_model = $this->model("group");
    /* Header and footer wiki pages are not used for presentations.
       Only the text after the END_HEAD_VARS marker is shown; a page
       without the marker is shown whole, a missing page as "" */
    $decorations = ['page_header' => 'PAGE_HEADER',
        'page_footer' => 'PAGE_FOOTER'];
    foreach ($decorations as $head_field => $data_field) {
        if (empty($head_info[$head_field]) ||
            $page_type == 'presentation') {
            continue;
        }
        $decoration_info = $group_model->getPageInfoByName(
            C\PUBLIC_GROUP_ID, $head_info[$head_field], $locale_tag,
            "read");
        $decoration_text = (isset($decoration_info['PAGE'])) ?
            "" . $decoration_info['PAGE'] : "";
        $decoration_parts = explode("END_HEAD_VARS", $decoration_text);
        $data[$data_field] = (isset($decoration_parts[1])) ?
            $decoration_parts[1] : $decoration_text;
    }
    $data['PAGE_ID'] = $group_model->getPageID(C\PUBLIC_GROUP_ID,
        $page, $locale_tag);
    if (!empty($_REQUEST['sf'])) {
        $sub_path = $this->clean($_REQUEST['sf'], 'string');
        // no dots allowed in the resource sub-folder path
        $sub_path = str_replace(".", "", $sub_path);
        $data['SUB_PATH'] = htmlentities($sub_path);
    } else {
        $sub_path = "";
    }
    if (!empty($_REQUEST['arg']) && $_REQUEST['arg']=='media' &&
        !empty($_REQUEST['n'])) {
        $data['CURRENT_LOCALE_TAG'] = $locale_tag;
        $this->component("social")->mediaWiki($data, C\PUBLIC_GROUP_ID,
            $data['PAGE_ID'], $sub_path);
    } else if ($page_type == 'media_list') {
        $data['GROUP']['GROUP_ID'] = C\PUBLIC_GROUP_ID;
        $data['HEAD'] = $head_info;
        $data['PAGE_NAME'] = $page;
        $data['CAN_EDIT'] = false;
        $data['MODE'] = "static";
        $data['RESOURCE_FILTER'] =
            (isset($_REQUEST['resource_filter'])) ?
            substr($this->clean($_REQUEST['resource_filter'],
            'file_name'), 0, C\SHORT_TITLE_LEN) : "";
        $data['page_type'] = 'media_list';
        $data['RESOURCES_INFO'] =
            $group_model->getGroupPageResourceUrls(
            C\PUBLIC_GROUP_ID, $data['PAGE_ID'], $sub_path);
        $this->component("social")->sortWikiResources($data);
    } else if ($page_type == 'presentation') {
        $data['page_type'] = 'presentation';
        $data['INCLUDE_SCRIPTS'][] = "slidy";
        $data['INCLUDE_STYLES'][] = "slidy";
    }
    return $data;
}
|
||||
/**
|
||||
* Used to read in a PUBLIC_GROUP_ID wiki page that will be presented
|
||||
* to non-logged in visitors to the site.
|
||||
*
|
||||
* @param string $page_name name of file less extension to read in
|
||||
* @return string text of page
|
||||
*/
|
||||
public function getPage($page_name)
{
    $group_model = $this->model("group");
    $locale_tag = L\getLocaleTag();
    $page_info = $group_model->getPageInfoByName(
        C\PUBLIC_GROUP_ID, $page_name, $locale_tag, "read");
    $page_string = isset($page_info["PAGE"]) ? $page_info["PAGE"] : "";
    if (!$page_string && $locale_tag != C\DEFAULT_LOCALE) {
        //fallback to default locale for translation
        $page_info = $group_model->getPageInfoByName(
            C\PUBLIC_GROUP_ID, $page_name, C\DEFAULT_LOCALE, "read");
        /* the page may not exist in the default locale either; return ""
           in that case rather than reading a missing index */
        $page_string = isset($page_info["PAGE"]) ? $page_info["PAGE"] : "";
    }
    return $page_string;
}
|
||||
}
|
1166
src/controllers/components/AccountaccessComponent.php
Normal file
1166
src/controllers/components/AccountaccessComponent.php
Normal file
File diff suppressed because it is too large
Load diff
484
src/controllers/components/AdvertisementComponent.php
Normal file
484
src/controllers/components/AdvertisementComponent.php
Normal file
|
@ -0,0 +1,484 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Pushkar Umaranikar
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers\components;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
|
||||
/**
|
||||
* Component of the Yioop control panel used to handle activities for
|
||||
* managing advertisements. i.e., create advertisement, activate/
|
||||
* deactivate advertisement, edit advertisement. It is used by AdminController
|
||||
*
|
||||
* @author Pushkar Umaranikar
|
||||
*/
|
||||
class AdvertisementComponent extends Component
|
||||
{
|
||||
/**
|
||||
* Used to manage the purchase and storage of advertising credits
|
||||
*
|
||||
* @return array $data field variables necessary for display of view
|
||||
*/
|
||||
public function manageCredits()
{
    $parent = $this->parent;
    $credit_model = $parent->model("credit");
    $signin_model = $parent->model("signin");
    $user_model = $parent->model("user");
    $data = [];
    $data['SCRIPT'] = "";
    $data['MESSAGE'] = "";
    $data["ELEMENT"] = "managecredits";
    $data["AMOUNTS"] = [0 => tl('advertisement_component_credit_amounts'),
        "10" => tl('advertisement_component_ten_in_credits'),
        "20" => tl('advertisement_component_twenty_in_credits'),
        "50" => tl('advertisement_component_fifty_in_credits'),
        "100" => tl('advertisement_component_hundred_in_credits'),
    ];
    // dollars paid => number of credits received
    $data['COST_AMOUNTS'] = [
        10 => 1000, 20 => 2000, 50 => 5000, 100 => 10000,
    ];
    $data['MONTHS'] = [ 0 => tl('advertisement_component_month'),
        "01" => "01", "02" => "02", "03" => "03",
        "04" => "04", "05" => "05", "06" => "06", "07" => "07",
        "08" => "08", "09" => "09", "10" => "10", "11" => "11",
        "12" => "12"
    ];
    $user_id = $_SESSION['USER_ID'];
    $username = $signin_model->getUserName($user_id);
    $data["USER"] = $user_model->getUser($username);
    $data["USER_ID"] = $user_id;
    $current_year = date('Y');
    $data['YEARS'] = [ 0 => tl('advertisement_component_year')];
    for ( $year = $current_year; $year < $current_year + 20; $year++ ) {
        $data['YEARS'][$year] = $year;
    }
    $arg = (isset($_REQUEST['arg'])) ? $parent->clean($_REQUEST['arg'],
        "string") : "";
    /* only amounts that appear in COST_AMOUNTS can be bought; the
       is_scalar test keeps an array-valued request field from being
       used as an array offset */
    $num_dollars = (isset($_REQUEST['NUM_DOLLARS']) &&
        is_scalar($_REQUEST['NUM_DOLLARS']) &&
        isset($data['COST_AMOUNTS'][$_REQUEST['NUM_DOLLARS']])) ?
        $_REQUEST['NUM_DOLLARS'] : 0;
    $data['BALANCE'] = $credit_model->getCreditBalance($user_id);
    if (C\CreditConfig::isActive() && (!($user_id == C\ROOT_ID &&
        C\ALLOW_FREE_ROOT_CREDIT_PURCHASE))) {
        $data["INCLUDE_SCRIPTS"][] = 'credit';
        $ad_script_found = false;
        /* look, newest version first, for a CreditConfig method whose
           name is derived from this site's base domain and the version */
        for ($i = C\YIOOP_VERSION; $i >= C\MIN_AD_VERSION; $i--) {
            $get_credit_token_initialize_script =
                "FN" . md5(UrlParser::getBaseDomain(C\NAME_SERVER) .
                $i . "getCreditTokenInitializeScript");
            if (method_exists( C\NS_CONFIGS . "CreditConfig",
                $get_credit_token_initialize_script)) {
                $ad_script_found = true;
                break;
            }
        }
        if ($ad_script_found) {
            $data['SCRIPT'] .=
                C\CreditConfig::$get_credit_token_initialize_script();
        } else {
            $data['DISPLAY_MESSAGE'] =
                tl('advertisement_component_script_failure');
        }
    }
    switch ($arg)
    {
        case "purchaseCredits":
            $message = "";
            if ($num_dollars <= 0) {
                return $parent->redirectWithMessage(
                    tl('advertisement_component_invalid_credit_quantity'),
                    []);
            }
            /* string to translate stored in column of 32 chars
               so not writing advertisement_component
             */
            $strings_to_translate_for_model = [
                tl('advertisement_buy_credits'),
                tl('advertisement_init_ledger')];
            /* a request without a CREDIT_TOKEN field is treated like an
               empty token instead of reading a missing index */
            $token = (isset($_REQUEST['CREDIT_TOKEN'])) ?
                $parent->clean($_REQUEST['CREDIT_TOKEN'], "string") : "";
            if (!($user_id == C\ROOT_ID &&
                C\ALLOW_FREE_ROOT_CREDIT_PURCHASE)) {
                $is_active = C\CreditConfig::isActive();
                if ($is_active && empty($token)) {
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_credit_token_empty'),
                        []);
                }
                if ($is_active && !C\CreditConfig::charge(
                    $num_dollars, $token, $message)) {
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_processing_error',
                        $message), []);
                }
            }
            $credit_model->updateCredits($user_id,
                $data['COST_AMOUNTS'][$num_dollars],
                'advertisement_buy_credits');
            return $parent->redirectWithMessage(
                tl('advertisement_component_credits_purchased'),
                []);
    }
    $search_array = [["timestamp", "", "", "DESC"]];
    $parent->pagingLogic($data, $credit_model, "TRANSACTIONS",
        C\DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
        ["USER_ID" => $user_id]);
    return $data;
}
|
||||
/**
|
||||
* Used to handle the Create, Edit and Activation of Advertisements
|
||||
*
|
||||
* @return array $data field variables necessary for display of view
|
||||
*/
|
||||
public function manageAdvertisements()
{
    $parent = $this->parent;
    $signin_model = $parent->model("signin");
    $user_model = $parent->model("user");
    $role_model = $parent->model('role');
    $advertisement_model = $parent->model("advertisement");
    $credit_model = $parent->model("credit");
    $data = [];
    $data['DURATIONS'] = [ 0 => tl('advertisement_component_num_days'),
        1 => tl('advertisement_component_one_day'),
        7 => tl('advertisement_component_seven_days'),
        30 => tl('advertisement_component_thirty_days'),
        90 => tl('advertisement_component_ninety_days'),
        180 => tl('advertisement_component_one_eighty_days'),
    ];
    // request field name => type passed to clean() for that field
    $request_field_types = [
        'context' => 'string',
        "start_row" => 'int', "num_show" => 'int', "end_row" => 'int',
        "NAME" => 'string', "DESTINATION" => 'web-url',
        "DESCRIPTION" => 'string', "KEYWORDS" => 'string',
        "BUDGET" => 'int', "DURATION" => array_keys($data['DURATIONS']),
        'id' => 'int', 'status' => 'int'];
    $request_fields = array_keys($request_field_types);
    $data['MONTHS'] = [ 0 => tl('advertisement_component_month'),
        "01" => "01", "02" => "02", "03" => "03",
        "04" => "04", "05" => "05", "06" => "06", "07" => "07",
        "08" => "08", "09" => "09", "10" => "10", "11" => "11",
        "12" => "12"
    ];
    $current_year = date('Y');
    $data['YEARS'] = [ 0 => tl('advertisement_component_year')];
    for ( $year = $current_year; $year < $current_year + 20; $year++ ) {
        $data['YEARS'][$year] = $year;
    }
    $data['SCRIPT'] = "";
    $data['MESSAGE'] = "";
    $data["ELEMENT"] = "manageadvertisements";
    $data['FORM_TYPE'] = "addadvertisement";
    $data['DURATION'] = 0;
    foreach ($request_field_types as $field => $type) {
        if (isset($_REQUEST[$field])) {
            $data[$field] = $parent->clean($_REQUEST[$field], $type);
        }
    }
    if (isset($_REQUEST['EDIT_AD'])) {
        unset($_REQUEST['CALCULATE']);
        unset($_REQUEST['arg']);
    }
    if (isset($_REQUEST['CALCULATE']) || (isset($_REQUEST['arg']) &&
        $_REQUEST['arg'] == "addadvertisement")) {
        /* names of the request fields to carry over the redirect. This
           uses the field name 'arg', as the other redirects in this
           method do, not the value of $_REQUEST['arg'], which may not
           even be set when only CALCULATE was sent */
        $preserve_on_error = array_merge(['arg'], $request_fields);
        if (empty($_REQUEST['NAME']) ||
            empty($_REQUEST['DESCRIPTION']) ||
            empty($_REQUEST['DESTINATION'])) {
            return $parent->redirectWithMessage(
                tl('advertisement_component_fields_cannot_be_empty'),
                $preserve_on_error);
        }
        if (!isset($_REQUEST['DURATION']) || $_REQUEST['DURATION'] == 0) {
            return $parent->redirectWithMessage(
                tl('advertisement_component_duration_cannot_be_empty'),
                $preserve_on_error);
        }
        if (empty($_REQUEST['KEYWORDS'])) {
            return $parent->redirectWithMessage(
                tl('advertisement_component_enter_keywords'),
                $preserve_on_error);
        }
        // a campaign starts today and runs DURATION days, today included
        $data['START_DATE'] = date(C\AD_DATE_FORMAT);
        $_REQUEST['START_DATE'] = $data['START_DATE'];
        $start_date = strtotime($data['START_DATE']);
        $data['END_DATE'] = date(C\AD_DATE_FORMAT,
            $start_date + (($data['DURATION'] - 1) * C\ONE_DAY));
        $_REQUEST['END_DATE'] = $data['END_DATE'];
        $this->initializeAdKeywords($data, $start_date, $data['DURATION']);
    }
    $user_id = $_SESSION['USER_ID'];
    $is_admin = $role_model->checkUserRole($user_id, C\ADMIN_ROLE);
    $data['HAS_ADMIN_ROLE'] = $is_admin;
    $username = $signin_model->getUserName($user_id);
    $data["USER"] = $user_model->getUser($username);
    $data["USER_ID"] = $user_id;
    $data['PAGING'] = "";
    $search_array = [];
    $arg = (isset($_REQUEST['arg'])) ? $parent->clean($_REQUEST['arg'],
        "string") : "";
    $data['BALANCE'] = $credit_model->getCreditBalance($user_id);
    switch ($arg)
    {
        case "addadvertisement":
            if ( isset($_REQUEST['PURCHASE'])) {
                $advertisement = [];
                $advertisement['USER_ID'] = $user_id;
                $fields = ["NAME", "DESCRIPTION",
                    "DESTINATION", "BUDGET", "KEYWORDS",
                    "START_DATE", "END_DATE"];
                foreach ($fields as $field) {
                    if (isset($_REQUEST[$field])) {
                        $advertisement[$field] = $data[$field];
                    }
                }
                if (empty($_REQUEST['KEYWORDS'])) {
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_enter_keywords'),
                        array_merge(['arg'], $request_fields));
                }
                /* a purchase sent without a BUDGET field is handled as
                   a bid of 0 rather than reading a missing index */
                $budget = (isset($advertisement["BUDGET"])) ?
                    $advertisement["BUDGET"] : 0;
                if ($budget < $data['AD_MIN_BID']) {
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_bid_too_low'),
                        array_merge(['arg'], $request_fields));
                }
                if ($data['BALANCE'] < $budget) {
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_too_few_credits'),
                        array_merge(['arg'], $request_fields));
                }
                /* tl() call kept so the string used as the ledger entry
                   type below has a translation entry */
                $strings_to_translate_for_model =
                    [tl('advertisement_buy_ad')];
                $advertisement_model->addAdvertisement($advertisement,
                    $data["AD_KEYWORDS"], $data['AD_MIN_BID'], $user_id);
                $credit_model->updateCredits($user_id,
                    -$budget,
                    'advertisement_buy_ad');
                $preserve = [];
                if (!empty($_REQUEST['context'])) {
                    $_REQUEST['arg'] = 'search';
                    $preserve[] = 'arg';
                }
                return $parent->redirectWithMessage(
                    tl('advertisement_component_ad_created'),
                    array_merge($preserve,
                    ["start_row", "num_show", "end_row"]));
            }
            break;
        case "changestatus":
            // both the ad id and the new status have to be supplied
            if (isset($_REQUEST['id']) && isset($data['status'])) {
                $ad = $advertisement_model->getAdvertisementById(
                    $data['id']);
                if (empty($ad) || ($user_id != $ad['USER_ID'] &&
                    !$is_admin) ) {
                    break;
                }
                /* owners may switch between active and deactivated;
                   an admin acting on someone else's ad between active
                   and suspended */
                $user_ad_statuses = [C\ADVERTISEMENT_ACTIVE_STATUS,
                    C\ADVERTISEMENT_DEACTIVATED_STATUS];
                $admin_ad_statuses = [C\ADVERTISEMENT_ACTIVE_STATUS,
                    C\ADVERTISEMENT_SUSPENDED_STATUS];
                if ($user_id == $ad['USER_ID'] && !in_array(
                    $data['status'], $user_ad_statuses)) {
                    break;
                } else if ($user_id != $ad['USER_ID'] &&
                    $is_admin && !in_array(
                    $data['status'], $admin_ad_statuses)) {
                    break;
                }
                $result = $advertisement_model->setAdvertisementStatus(
                    $data['id'], $data['status']);
                if ($result) {
                    $preserve = ["start_row", "end_row", "num_show"];
                    if (!empty($_REQUEST['context'])) {
                        $_REQUEST['arg'] = 'search';
                        $preserve[] = 'arg';
                    }
                    // translate the identifier once, not its translation
                    return $parent->redirectWithMessage(
                        tl('advertisement_component_status_changed'),
                        array_merge($preserve, $request_fields));
                }
            }
            break;
        case "editadvertisement":
            $data["FORM_TYPE"] = "editadvertisement";
            $update = false;
            if (isset($_REQUEST['save'])) {
                $update = true;
            }
            if (isset($_REQUEST['id'])) {
                $ad = $advertisement_model->getAdvertisementById(
                    $data['id']);
                $ad_fields = ["NAME", "DESTINATION",
                    "DESCRIPTION","BUDGET","KEYWORDS",
                    "START_DATE", 'END_DATE'];
                if (!empty($ad) && ($user_id == $ad['USER_ID'] ||
                    $is_admin)) {
                    // values sent with the request win over stored ones
                    foreach ($ad_fields as $field) {
                        $data[$field] = isset($data[$field]) ?
                            $data[$field] : $ad[$field];
                    }
                    if ($is_admin) {
                        $data['AD_USER_NAME'] = $user_model->getUsername(
                            $ad['USER_ID']);
                    }
                    if ($update) {
                        $updated_advertisement = [];
                        // only these fields can be edited after purchase
                        $ad_update_fields = ["NAME",
                            "DESCRIPTION","DESTINATION"];
                        foreach ($ad_update_fields as $field) {
                            if (isset($_REQUEST[$field])) {
                                $updated_advertisement[$field] =
                                    $data[$field];
                            }
                        }
                        $advertisement_model->updateAdvertisement(
                            $updated_advertisement, $data['id']);
                        foreach ($request_fields as $field) {
                            unset($data[$field]);
                        }
                        unset($data['START_DATE']);
                        unset($data['END_DATE']);
                        return $parent->redirectWithMessage(
                            tl('advertisement_component_ad_updated'),
                            ["arg", "id", "start_row", "num_show",
                            "end_row", "context"]);
                    }
                }
            }
            break;
        case "search":
            $data["FORM_TYPE"] = "search";
            $search_array =
                $parent->tableSearchRequestHandler($data,
                "manageAdvertisements",
                ['name', 'description', 'destination', 'keywords',
                'budget', 'start_date', 'end_date']);
            if (empty($_SESSION['LAST_SEARCH']['manageAdvertisements']) ||
                isset($_REQUEST['name'])) {
                $_SESSION['LAST_SEARCH']['manageAdvertisements'] =
                    $_SESSION['SEARCH']['manageAdvertisements'];
                unset($_SESSION['SEARCH']['manageAdvertisements']);
            } else {
                $default_search = true;
            }
            break;
    }
    if ($search_array == [] || !empty($default_search)) {
        if (!empty($_SESSION['LAST_SEARCH']['manageAdvertisements'])) {
            if (!empty($_REQUEST['arg']) && $_REQUEST['arg'] == 'search') {
                $search_array =
                    $parent->restoreLastSearchFromSession($data,
                    'manageAdvertisements');
            } else if (!empty($_REQUEST['context'])) {
                $search_array =
                    $_SESSION['LAST_SEARCH']['manageAdvertisements'][
                    'SEARCH_ARRAY'];
                $data['PAGING'] = $_SESSION['LAST_SEARCH'][
                    'manageAdvertisements']['PAGING'];
            }
        }
        if ($search_array == []) {
            // default listing order: by id, descending
            $search_array[] = ["id", "", "", "DESC"];
        }
    }
    if (!C\MOBILE) {
        $data['SCRIPT'] .= "\npreview(elt('ad-name'))\n" .
            "preview(elt('ad-description'))\n".
            "preview(elt('ad-destination'), 'ad-name')\n";
    }
    $parent->pagingLogic($data, $advertisement_model, "ADVERTISEMENTS",
        C\DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
        ["USER_ID" => $user_id, "ADMIN" => $is_admin]);
    return $data;
}
|
||||
/**
|
||||
* Sets up the $data['AD_KEYWORDS'] as an associative array of
|
||||
* (keyword, day) => bid_amounts based on min bid for that ad keyword on
|
||||
* that day. Set up $data['EXPENSIVE_KEYWORD'] as the most expensive
|
||||
* ad keyword for the dates in question and also sets up $data['AD_MIN_BID']
|
||||
* as the minimum bid required for the dates in question
|
||||
*
|
||||
* @param array &$data associative array of data used by the view to
|
||||
* draw itself
|
||||
* @param int $start_date start date in seconds since beginning of Unix
|
||||
* epoch
|
||||
* @param int $day_count number of days ad campaign will last
|
||||
*/
|
||||
public function initializeAdKeywords(&$data, $start_date, $day_count)
{
    $advertisement_model = $this->parent->model('advertisement');
    /* keywords are upper-cased, split on commas and trimmed. The built-in
       trim is mapped directly; a class-name callable for the non-static
       trim_value method is not a valid callback on PHP 8 */
    $keywords = array_map('trim',
        explode(",", strtoupper($data['KEYWORDS'])));
    if (!isset($data['AD_KEYWORDS'])) {
        $data['AD_KEYWORDS'] = [];
    }
    $min_bid_reqd = 0;
    $expensive_bid = 0;
    foreach ($keywords as $keyword) {
        $date = date(C\AD_DATE_FORMAT, $start_date);
        // total of the per-day amounts recorded for this keyword
        $keyword_bid_amount = 0;
        for ($k = 0; $k < $day_count; $k++) {
            $bid_amount = $advertisement_model->getBidAmount(
                $keyword, $date);
            $half_bid = ceil($bid_amount/2);
            if ($bid_amount > C\AD_KEYWORD_INIT_BID ) {
                $min_bid_reqd += $half_bid;
                $data['AD_KEYWORDS'][$keyword][$date] =
                    $half_bid;
                $keyword_bid_amount += $half_bid;
            } else {
                /* NOTE(review): here the minimum bid grows by the full
                   bid amount while the half bid is what gets recorded
                   for the day; kept as is -- confirm this asymmetry is
                   intended */
                $min_bid_reqd += $bid_amount;
                $data['AD_KEYWORDS'][$keyword][$date] =
                    $half_bid;
                $keyword_bid_amount += $half_bid;
            }
            $date = date(C\AD_DATE_FORMAT, strtotime($date .' +1 day'));
        }
        // ">=" means that on a tie the later keyword is reported
        if ($keyword_bid_amount >= $expensive_bid) {
            $expensive_bid = $keyword_bid_amount;
            $data['EXPENSIVE_KEYWORD'] = $keyword;
        }
    }
    $data['AD_MIN_BID'] = $min_bid_reqd;
}
|
||||
/**
|
||||
* Trim white spaces callback for array_walk
|
||||
*
|
||||
* @param string& $value string to remove initial and trailing whitespace
|
||||
* from
|
||||
*/
|
||||
public static function trim_value(&$value)
{
    /* Declared static because it is referenced as a callback through the
       class name; it uses no instance state and can still be called on
       an instance */
    $value = trim($value);
}
|
||||
}
|
85
src/controllers/components/Component.php
Normal file
85
src/controllers/components/Component.php
Normal file
|
@ -0,0 +1,85 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\controllers\components;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/**
|
||||
* Translate the supplied arguments into the current locale.
|
||||
*
|
||||
* This function is a convenience copy of the same function
|
||||
* @see seekquarry\yioop\library\tl() to this subnamespace
|
||||
*
|
||||
* @param string string_identifier identifier to be translated
|
||||
* @param mixed additional_args used for interpolation in translated string
|
||||
* @return string translated string
|
||||
*/
|
||||
function tl()
{
    // fully qualified name of the library's tl function
    $library_tl = C\NS_LIB . "tl";
    // pass along whatever arguments this wrapper was called with
    $arguments = func_get_args();
    return call_user_func_array($library_tl, $arguments);
}
|
||||
/**
|
||||
* shorthand for echo
|
||||
*
|
||||
* @param string $text string to send to the current output
|
||||
*/
|
||||
function e($text)
{
    // write $text to the current output
    print $text;
}
|
||||
/**
|
||||
* Base component class for all components on
|
||||
* the SeekQuarry site. A component consists of a collection of
|
||||
* activities and their auxiliary methods that can be used by a controller
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class Component
{
    /**
     * The controller this component is attached to (null until the
     * constructor has run)
     *
     * @var object
     */
    public $parent;

    /**
     * Creates the component and remembers the controller it belongs to
     *
     * @param object $parent_controller the controller this component
     *      lives on
     */
    public function __construct($parent_controller)
    {
        $this->parent = $parent_controller;
    }
}
|
2047
src/controllers/components/CrawlComponent.php
Normal file
2047
src/controllers/components/CrawlComponent.php
Normal file
File diff suppressed because it is too large
Load diff
4021
src/controllers/components/SocialComponent.php
Normal file
4021
src/controllers/components/SocialComponent.php
Normal file
File diff suppressed because it is too large
Load diff
1284
src/controllers/components/SystemComponent.php
Normal file
1284
src/controllers/components/SystemComponent.php
Normal file
File diff suppressed because it is too large
Load diff
215
src/css/editor.css
Normal file
215
src/css/editor.css
Normal file
|
@ -0,0 +1,215 @@
|
|||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Eswara Rajesh Pinapala
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
/*
|
||||
editor.css
|
||||
Stylesheet for text-area editor associated with editing wiki pages
|
||||
*/
|
||||
.wiki-editor
|
||||
{
|
||||
display: block;
|
||||
margin: 0 auto;
|
||||
}
|
||||
.wiki-editor div
|
||||
{
|
||||
padding: 5px;
|
||||
}
|
||||
.wiki-buttons
|
||||
{
|
||||
margin-bottom:2px;
|
||||
}
|
||||
.wiki-buttons input
|
||||
{
|
||||
border-style: solid;
|
||||
cursor: pointer;
|
||||
display: inline-block;
|
||||
height: 36px;
|
||||
vertical-align: top;
|
||||
width: 36px;
|
||||
}
|
||||
.wiki-buttons select
|
||||
{
|
||||
background: lightgray;
|
||||
border: 1px solid #EEE;
|
||||
font-size:14pt;
|
||||
height:36px;
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
.wiki-popup-prompt
|
||||
{
|
||||
background-color:gray;
|
||||
display: none;
|
||||
height:100%;
|
||||
left: 0;
|
||||
right:0;
|
||||
opacity:0.95;
|
||||
position:fixed;
|
||||
top: 0;
|
||||
width:100%;
|
||||
z-index:100;
|
||||
}
|
||||
|
||||
.wiki-popup-content
|
||||
{
|
||||
background-color: white;
|
||||
border: 16px solid #8A4;
|
||||
height: 3in;
|
||||
left: 20%;
|
||||
right: 20%;
|
||||
overflow: auto;
|
||||
padding: 16px;
|
||||
position:fixed;
|
||||
top: 20%;
|
||||
width: 5in;
|
||||
z-index:101;
|
||||
|
||||
}
|
||||
.wiki-popup-content h2
|
||||
{
|
||||
padding:30px;
|
||||
}
|
||||
.wiki-popup-content div
|
||||
{
|
||||
padding:15px;
|
||||
}
|
||||
.wiki-popup-content select
|
||||
{
|
||||
font-size: 18pt;
|
||||
}
|
||||
.wiki-popup-content table
|
||||
{
|
||||
margin: auto;
|
||||
}
|
||||
|
||||
.wikibtn-bold
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-bold.png) no-repeat;
|
||||
}
|
||||
.wikibtn-underline
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-underline.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-bullets
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-bullets.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-heading
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-heading.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-hr
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-hr.png) no-repeat;
|
||||
}
|
||||
.wikibtn-hyperlink
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-hyperlink.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-italic
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-italic.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-nowiki
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-nowiki.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-numbers
|
||||
{
|
||||
background:url(../resources/wiki_button_images/wikibtn-numbers.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-strike
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-strike.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-table
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-table.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-search-widget
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-search-widget.png)
|
||||
no-repeat;
|
||||
}
|
||||
|
||||
.wikibtn-slide
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-slide.png)
|
||||
no-repeat;
|
||||
}
|
||||
|
||||
.wikibtn-definitionlist
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-definitionlist.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-centeraligned
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-centeraligned.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-rightaligned
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-rightaligned.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-leftaligned
|
||||
{
|
||||
background: url(../resources/wiki_button_images/wikibtn-leftaligned.png)
|
||||
no-repeat;
|
||||
}
|
||||
.wikibtn-bold:hover,
|
||||
.wikibtn-bullets:hover,
|
||||
.wikibtn-heading:hover,
|
||||
.wikibtn-hr:hover,
|
||||
.wikibtn-hyperlink:hover,
|
||||
.wikibtn-italic:hover,
|
||||
.wikibtn-nowiki:hover,
|
||||
.wikibtn-numbers:hover,
|
||||
.wikibtn-search-widget:hover,
|
||||
.wikibtn-strike:hover,
|
||||
.wikibtn-table:hover,
|
||||
.wikibtn-underline:hover,
|
||||
.wikibtn-slide:hover,
|
||||
.wikibtn-definitionlist:hover,
|
||||
.wikibtn-rightaligned:hover,
|
||||
.wikibtn-leftaligned:hover,
|
||||
.wikibtn-centeraligned:hover
|
||||
{
|
||||
background-position: 0 -36px;
|
||||
}
|
3127
src/css/search.css
Normal file
3127
src/css/search.css
Normal file
File diff suppressed because it is too large
Load diff
402
src/css/slidy.css
Normal file
402
src/css/slidy.css
Normal file
|
@ -0,0 +1,402 @@
|
|||
/* slidy.css
|
||||
|
||||
Copyright (c) 2005-2010 W3C (MIT, ERCIM, Keio), All Rights Reserved.
|
||||
W3C liability, trademark, document use and software licensing
|
||||
rules apply, see:
|
||||
|
||||
http://www.w3.org/Consortium/Legal/copyright-documents
|
||||
http://www.w3.org/Consortium/Legal/copyright-software
|
||||
*/
|
||||
body
|
||||
{
|
||||
margin: 0 0 0 0;
|
||||
padding: 0 0 0 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
color: black;
|
||||
background-color: white;
|
||||
font-family: "Gill Sans MT", "Gill Sans", GillSans, sans-serif;
|
||||
font-size: 14pt;
|
||||
}
|
||||
|
||||
div.toolbar {
|
||||
position: fixed; z-index: 200;
|
||||
top: auto; bottom: 0; left: 0; right: 0;
|
||||
height: 1.2em; text-align: right;
|
||||
padding-left: 1em;
|
||||
padding-right: 1em;
|
||||
font-size: 60%;
|
||||
color: red;
|
||||
background-color: rgb(240,240,240);
|
||||
border-top: solid 1px rgb(180,180,180);
|
||||
}
|
||||
|
||||
div.toolbar span.copyright {
|
||||
color: black;
|
||||
margin-left: 0.5em;
|
||||
}
|
||||
|
||||
div.initial_prompt {
|
||||
position: absolute;
|
||||
z-index: 1000;
|
||||
bottom: 1.2em;
|
||||
width: 100%;
|
||||
background-color: rgb(200,200,200);
|
||||
opacity: 0.35;
|
||||
background-color: rgba(200,200,200, 0.35);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
div.initial_prompt p.help {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
div.initial_prompt p.close {
|
||||
text-align: right;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
div.slidy_toc {
|
||||
position: absolute;
|
||||
z-index: 300;
|
||||
width: 60%;
|
||||
max-width: 30em;
|
||||
height: 30em;
|
||||
overflow: auto;
|
||||
top: auto;
|
||||
right: auto;
|
||||
left: 4em;
|
||||
bottom: 4em;
|
||||
padding: 1em;
|
||||
background: rgb(240,240,240);
|
||||
border-style: solid;
|
||||
border-width: 2px;
|
||||
font-size: 60%;
|
||||
}
|
||||
|
||||
div.slidy_toc .toc_heading {
|
||||
text-align: center;
|
||||
width: 100%;
|
||||
margin: 0;
|
||||
margin-bottom: 1em;
|
||||
border-bottom-style: solid;
|
||||
border-bottom-color: rgb(180,180,180);
|
||||
border-bottom-width: 1px;
|
||||
}
|
||||
|
||||
div.slide {
|
||||
z-index: 20;
|
||||
margin: 0 0 0 0;
|
||||
padding-top: 0;
|
||||
padding-bottom: 0;
|
||||
padding-left: 20px;
|
||||
padding-right: 20px;
|
||||
border-width: 0;
|
||||
clear: both;
|
||||
top: 0;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
line-height: 120%;
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
div.background {
|
||||
display: none;
|
||||
}
|
||||
|
||||
div.handout {
|
||||
margin-left: 20px;
|
||||
margin-right: 20px;
|
||||
}
|
||||
|
||||
div.slide.titlepage {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
div.slide.titlepage h1 {
|
||||
padding-top: 10%;
|
||||
margin-right: 0;
|
||||
}
|
||||
|
||||
div.slide h1 {
|
||||
padding-left: 0;
|
||||
padding-right: 20pt;
|
||||
padding-top: 4pt;
|
||||
padding-bottom: 4pt;
|
||||
margin-top: 0;
|
||||
margin-left: 0;
|
||||
margin-right: 60pt;
|
||||
margin-bottom: 0.5em;
|
||||
display: block;
|
||||
font-size: 160%;
|
||||
line-height: 1.2em;
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
@media screen and (max-device-width: 1024px)
|
||||
{
|
||||
div.slide { font-size: 100%; }
|
||||
}
|
||||
|
||||
@media screen and (max-device-width: 800px)
|
||||
{
|
||||
div.slide { font-size: 200%; }
|
||||
div.slidy_toc {
|
||||
top: 1em;
|
||||
left: 1em;
|
||||
right: auto;
|
||||
width: 80%;
|
||||
font-size: 180%;
|
||||
}
|
||||
}
|
||||
|
||||
div.toc-heading {
|
||||
width: 100%;
|
||||
border-bottom: solid 1px rgb(180,180,180);
|
||||
margin-bottom: 1em;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
img {
|
||||
image-rendering: optimize-quality;
|
||||
}
|
||||
|
||||
pre {
|
||||
font-size: 80%;
|
||||
font-weight: bold;
|
||||
line-height: 120%;
|
||||
padding-top: 0.2em;
|
||||
padding-bottom: 0.2em;
|
||||
padding-left: 1em;
|
||||
padding-right: 1em;
|
||||
border-style: solid;
|
||||
border-left-width: 1em;
|
||||
border-top-width: thin;
|
||||
border-right-width: thin;
|
||||
border-bottom-width: thin;
|
||||
border-color: #95ABD0;
|
||||
color: #00428C;
|
||||
background-color: #E4E5E7;
|
||||
}
|
||||
|
||||
li pre { margin-left: 0; }
|
||||
|
||||
blockquote { font-style: italic }
|
||||
|
||||
img { background-color: transparent }
|
||||
|
||||
p.copyright { font-size: smaller }
|
||||
|
||||
.center { text-align: center }
|
||||
.footnote { font-size: smaller; margin-left: 2em; }
|
||||
|
||||
a img { border-width: 0; border-style: none }
|
||||
|
||||
a:visited { color: navy }
|
||||
a:link { color: navy }
|
||||
a:hover { color: red; text-decoration: underline }
|
||||
a:active { color: red; text-decoration: underline }
|
||||
|
||||
a {text-decoration: none}
|
||||
.toolbar a:link {color: blue}
|
||||
.toolbar a:visited {color: blue}
|
||||
.toolbar a:active {color: red}
|
||||
.toolbar a:hover {color: red}
|
||||
|
||||
ul { list-style-type: square; }
|
||||
ul ul { list-style-type: disc; }
|
||||
ul ul ul { list-style-type: circle; }
|
||||
ul ul ul ul { list-style-type: disc; }
|
||||
li { margin-left: 0.5em; margin-top: 0.5em; }
|
||||
li li { font-size: 85%; font-style: italic }
|
||||
li li li { font-size: 85%; font-style: normal }
|
||||
|
||||
div dt
|
||||
{
|
||||
margin-left: 0;
|
||||
margin-top: 1em;
|
||||
margin-bottom: 0.5em;
|
||||
font-weight: bold;
|
||||
}
|
||||
div dd
|
||||
{
|
||||
margin-left: 2em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
|
||||
p,pre,ul,ol,blockquote,h2,h3,h4,h5,h6,dl,table {
|
||||
margin-left: 1em;
|
||||
margin-right: 1em;
|
||||
}
|
||||
|
||||
p.subhead { font-weight: bold; margin-top: 2em; }
|
||||
|
||||
.smaller { font-size: smaller }
|
||||
.bigger { font-size: 130% }
|
||||
|
||||
td,th { padding: 0.2em }
|
||||
|
||||
ul {
|
||||
margin: 0.5em 1.5em 0.5em 1.5em;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
ol {
|
||||
margin: 0.5em 1.5em 0.5em 1.5em;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
ul { list-style-type: square; }
|
||||
ul ul { list-style-type: disc; }
|
||||
ul ul ul { list-style-type: circle; }
|
||||
ul ul ul ul { list-style-type: disc; }
|
||||
|
||||
ul li {
|
||||
list-style: square;
|
||||
margin: 0.1em 0em 0.6em 0;
|
||||
padding: 0 0 0 0;
|
||||
line-height: 140%;
|
||||
}
|
||||
|
||||
ol li {
|
||||
margin: 0.1em 0em 0.6em 1.5em;
|
||||
padding: 0 0 0 0px;
|
||||
line-height: 140%;
|
||||
list-style-type: decimal;
|
||||
}
|
||||
|
||||
li ul li {
|
||||
font-size: 85%;
|
||||
font-style: italic;
|
||||
list-style-type: disc;
|
||||
background: transparent;
|
||||
padding: 0 0 0 0;
|
||||
}
|
||||
li li ul li {
|
||||
font-size: 85%;
|
||||
font-style: normal;
|
||||
list-style-type: circle;
|
||||
background: transparent;
|
||||
padding: 0 0 0 0;
|
||||
}
|
||||
li li li ul li {
|
||||
list-style-type: disc;
|
||||
background: transparent;
|
||||
padding: 0 0 0 0;
|
||||
}
|
||||
|
||||
li ol li {
|
||||
list-style-type: decimal;
|
||||
}
|
||||
|
||||
|
||||
li li ol li {
|
||||
list-style-type: decimal;
|
||||
}
|
||||
|
||||
/*
|
||||
setting class="outline" on ol or ul makes it behave as an
|
||||
outline list where block-level content in li elements is
|
||||
hidden by default and can be expanded or collapsed with
|
||||
mouse click. Set class="expand" on li to override default
|
||||
*/
|
||||
|
||||
ol.outline li:hover { cursor: pointer }
|
||||
ol.outline li.nofold:hover { cursor: default }
|
||||
|
||||
ul.outline li:hover { cursor: pointer }
|
||||
ul.outline li.nofold:hover { cursor: default }
|
||||
|
||||
ol.outline { list-style:decimal; }
|
||||
ol.outline ol { list-style-type:lower-alpha }
|
||||
|
||||
ol.outline li.nofold {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/nofold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ol.outline li.unfolded {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/fold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ol.outline li.folded {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/unfold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ol.outline li.unfolded:hover {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/fold.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ol.outline li.folded:hover {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/unfold.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
|
||||
ul.outline li.nofold {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/nofold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ul.outline li.unfolded {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/fold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ul.outline li.folded {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/unfold-dim.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ul.outline li.unfolded:hover {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/fold.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
ul.outline li.folded:hover {
|
||||
padding: 0 0 0 20px;
|
||||
background: transparent url(../graphics/unfold.gif) no-repeat 0px 0.5em;
|
||||
}
|
||||
|
||||
/* for slides with class "title" in table of contents */
|
||||
a.titleslide { font-weight: bold; font-style: italic }
|
||||
|
||||
/*
|
||||
hide images as a workaround for the "save as" bug
|
||||
where browsers fail to save images used by CSS
|
||||
*/
|
||||
img.hidden { display: none; visibility: hidden }
|
||||
div.initial_prompt { display: none; visibility: hidden }
|
||||
|
||||
div.slide {
|
||||
visibility: visible;
|
||||
position: inherit;
|
||||
}
|
||||
div.handout {
|
||||
border-top-style: solid;
|
||||
border-top-width: thin;
|
||||
border-top-color: black;
|
||||
}
|
||||
|
||||
@media screen {
|
||||
.hidden { display: none; visibility: visible }
|
||||
|
||||
div.slide.hidden { display: block; visibility: visible }
|
||||
div.handout.hidden { display: block; visibility: visible }
|
||||
div.background { display: none; visibility: hidden }
|
||||
body.single_slide div.initial_prompt { display: block; visibility: visible }
|
||||
body.single_slide div.background { display: block; visibility: visible }
|
||||
body.single_slide div.background.hidden { display: none; visibility: hidden }
|
||||
body.single_slide .invisible { visibility: hidden }
|
||||
body.single_slide .hidden { display: none; visibility: hidden }
|
||||
body.single_slide div.slide { position: absolute }
|
||||
body.single_slide div.handout { display: none; visibility: hidden }
|
||||
}
|
||||
|
||||
@media print {
|
||||
.hidden { display: block; visibility: visible }
|
||||
|
||||
div.slide pre { font-size: 60%; padding-left: 0.5em; }
|
||||
div.toolbar { display: none; visibility: hidden; }
|
||||
div.slidy_toc { display: none; visibility: hidden; }
|
||||
div.background { display: none; visibility: hidden; }
|
||||
div.slide { page-break-before: always }
|
||||
/* :first-child isn't reliable for print media */
|
||||
div.slide.first-slide { page-break-before: avoid }
|
||||
}
|
BIN
src/data/default.db
Normal file
BIN
src/data/default.db
Normal file
Binary file not shown.
58
src/error.php
Normal file
58
src/error.php
Normal file
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Web page used to HTTP display error pages for
|
||||
* the SeekQuarry/Yioop Search engine
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\controllers\StaticController;
|
||||
|
||||
/*
 * Only the 404 and 409 error pages are supported; any other value of p
 * (or no p at all) falls back to 404. The membership test is strict so that
 * numeric look-alikes such as "404.0" or " 404", which compare loosely
 * equal to "404", are normalized as well instead of being left as-is in
 * $_REQUEST['p'].
 */
if (!isset($_REQUEST['p']) ||
    !in_array($_REQUEST['p'], ["404", "409"], true)) {
    $_REQUEST['p'] = "404";
}
// Send the HTTP status line that matches the error page being shown
switch ($_REQUEST['p']) {
    case "404":
        header("HTTP/1.0 404 Not Found");
        break;
    case "409":
        header("HTTP/1.0 409 Conflict");
        break;
}
// The error page itself is requested from the "static" controller
$_REQUEST['c'] = "static";
/*
 * NOTE(review): presumably this makes index.php skip its own call to
 * bootstrap() so that it can be called explicitly below -- confirm against
 * index.php.
 */
define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
/**
 * load in main entry point
 */
require_once(__DIR__."/index.php");
bootstrap();
exit();
|
BIN
src/examples/0-Archive1421025145.zip
Normal file
BIN
src/examples/0-Archive1421025145.zip
Normal file
Binary file not shown.
BIN
src/examples/IndexData1421025145.zip
Normal file
BIN
src/examples/IndexData1421025145.zip
Normal file
Binary file not shown.
82
src/examples/QueryCacher.php
Normal file
82
src/examples/QueryCacher.php
Normal file
|
@ -0,0 +1,82 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\examples;
|
||||
|
||||
/**
 * Script to run a sequence of queries against a Yioop instance so that
 * their results can be cached by that instance
 */
define("YIOOP_URL", "http://localhost/");
define("TIME_BETWEEN_REQUEST_IN_SECONDS", 5);
define("QUERY_AGENT_NAME", "QUERY_CACHER");
if (empty($argv[1])) {
    echo <<< EOD
QUERY_CACHER
============
This program runs a sequence of queries against a Yioop Installation.
If file caching is turned on for that Yioop Installation, then those query
will be saved to its cache. To run this program, type a command like:
php QueryCacher.php file_name.txt
Here file_name.txt is the name of a text file with one query/line.
EOD;
    exit();
} else {
    echo <<< EOD
QUERY_CACHER
============
Now running a sequence of queries against the yioop installation at:

EOD;
    echo YIOOP_URL ."\n\n";
}
$queries = file($argv[1]);
if ($queries === false) {
    // file() returns false (rather than an array) if it cannot read the file
    echo "Could not read the query file: " . $argv[1] . "\n";
    exit(1);
}
$agent = curl_init();
curl_setopt($agent, CURLOPT_USERAGENT, QUERY_AGENT_NAME);
curl_setopt($agent, CURLOPT_AUTOREFERER, true);
curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true);
/*
 * NOTE(review): certificate checks are switched off by the next two options.
 * That is tolerable for a demo pointed at localhost, but should be
 * reconsidered before pointing YIOOP_URL at a remote https site.
 */
curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($agent, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($agent, CURLOPT_NOSIGNAL, true);
curl_setopt($agent, CURLOPT_RETURNTRANSFER, true);
curl_setopt($agent, CURLOPT_FAILONERROR, true);
curl_setopt($agent, CURLOPT_TIMEOUT, TIME_BETWEEN_REQUEST_IN_SECONDS);
curl_setopt($agent, CURLOPT_CONNECTTIMEOUT,
    TIME_BETWEEN_REQUEST_IN_SECONDS);
curl_setopt($agent, CURLOPT_HTTPHEADER, ['Expect:']);
$i = 1;
foreach ($queries as $query) {
    /*
     * file() keeps each line's end-of-line characters; strip them so they
     * are not url-encoded into the query, and skip lines with no query.
     */
    $query = trim($query);
    if ($query === "") {
        continue;
    }
    echo $i . " ". $query . "\n";
    curl_setopt($agent, CURLOPT_URL, YIOOP_URL . "?q=". urlencode($query));
    $response = curl_exec($agent);
    if ($response === false) {
        // the page contents are not needed, but a failed request is reported
        echo "   request failed: " . curl_error($agent) . "\n";
    }
    $i++;
    // pause between requests so as not to overload the Yioop instance
    sleep(TIME_BETWEEN_REQUEST_IN_SECONDS);
}
curl_close($agent);
|
201
src/examples/SearchApi.php
Normal file
201
src/examples/SearchApi.php
Normal file
|
@ -0,0 +1,201 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\examples;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\controllers\SearchController;
|
||||
/**
|
||||
* This file contains an example script to show the different
|
||||
* methods of the Yioop! search api
|
||||
*/
|
||||
// this example should be only run from the command-line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/** Calculate base directory of script @ignore
 * If you have Yioop! in a separate folder from your web-site
 * You should change BASE_DIR to the location of the Yioop! directory
 */
define("seekquarry\\yioop\\configs\\PARENT_DIR",
    substr(dirname(realpath($_SERVER['PHP_SELF'])), 0,
    -strlen("/src/examples")));
define("seekquarry\\yioop\\configs\\BASE_DIR", C\PARENT_DIR . "/src");
/** Load in global configuration settings; you need this*/
require_once C\BASE_DIR.'/configs/Config.php';
if (!C\PROFILE) {
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/*
 * We now move the search API test index over to the WORK_DIRECTORY
 * if it isn't already there. In a real-world set-up a user would have
 * put a crawl into the WORK_DIRECTORY and that would be used to make the
 * query.
 */
$archive_timestamp = "1421025145";
$archive = C\BASE_DIR."/examples/0-Archive$archive_timestamp.zip";
$index_archive = C\BASE_DIR."/examples/IndexData$archive_timestamp.zip";
$extract_folder = C\CRAWL_DIR."/cache";
if (!file_exists($archive) ||
    !file_exists($index_archive)) {
    echo "\nSearch API test index doesn't exist, so can't run demo\n\n";
    exit();
}
$archives_to_extract = [$archive, $index_archive];
if (class_exists("\ZipArchive")) {
    $zip = new \ZipArchive();
    foreach ($archives_to_extract as $zip_path) {
        // ZipArchive::open returns true on success, an error code otherwise
        if ($zip->open($zip_path) !== true) {
            echo "\nCould not open $zip_path, so can't run demo\n\n";
            exit();
        }
        $zip->extractTo($extract_folder);
        $zip->close();
    }
} else {
    // fall back to the unzip command; paths are escaped as they may
    // contain spaces or other shell metacharacters
    foreach ($archives_to_extract as $zip_path) {
        exec("unzip " . escapeshellarg($zip_path) . " -d " .
            escapeshellarg($extract_folder));
    }
}
// Create a SearchController to do queries with
$controller = new SearchController();
/*
  Now we can do queries! First do a simple search on art and print the results
 */
echo "\n\n\nAn example of a query request with the search API:\n";
echo "Total rows numbers are high because by default grouping is done.\n";

$query = "art i:$archive_timestamp";
/* i:$archive_timestamp (here i:1421025145) is the timestamp of the index
   to use. API requires that a default index be set even though the query
   might specify to use a different one. The query string we pass to the
   API can be anything you can type into Yioop! search box.
 */
$num_results = 10; // how many results to get back
$first_result_to_return = 0;
// which ranked result should be the first to be returned (0 = highest ranked)
$data = $controller->queryRequest($query, $num_results,
    $first_result_to_return);
outputQueryData($data);

/*
   next we do a related search (as our index only has one page in it)
   the only related page is the page itself
 */
echo "\n\n\nAn example of making a related query request with the search API\n";
$url = "http://www.ucanbuyart.com/";
$num_results = 10; // how many results to get back
$first_result_to_return = 0;
$index_timestamp = $archive_timestamp;
$data = $controller->relatedRequest($url, $num_results,
    $first_result_to_return, $index_timestamp);
outputQueryData($data);
/*
   Finally, we give an example of requesting the cached version of
   a downloaded page...
 */
echo "\n\n\nAn example of making a cached of page request".
    " with the search API:\n";
$url = "http://www.ucanbuyart.com/";
$ui_flags = [];
$search_terms = "art classifieds"; // these words will be highlighted
$index_timestamp = $archive_timestamp;
$data = $controller->cacheRequest($url, $ui_flags,
    $search_terms, $index_timestamp);
echo $data;
/*
   We now delete the example index to clean-up our test. In real-life
   you wouldn't want to delete your query index after making one query
 */
unlinkRecursive(C\CRAWL_DIR."/cache/0-Archive$archive_timestamp");
unlinkRecursive(C\CRAWL_DIR."/cache/IndexData$archive_timestamp");
// demo over, bye-bye for now!
exit();
|
||||
/**
 * Pretty-prints to standard output the data returned by a Yioop! query
 *
 * For each entry of $data['PAGES'] its title, url, description and scores
 * are printed; afterwards summary statistics computed from $data['LIMIT'],
 * $data['RESULTS_PER_PAGE'] and $data['TOTAL_ROWS'] are printed.
 *
 * @param array $data what we got back from doing a query
 */
function outputQueryData($data)
{
    $divider = "============\n";
    // one delimited record per result page
    foreach ($data['PAGES'] as $page) {
        echo $divider,
            "TITLE: ", trim($page[CrawlConstants::TITLE]), "\n",
            "URL: ", trim($page[CrawlConstants::URL]), "\n",
            "DESCRIPTION:",
            wordwrap(trim($page[CrawlConstants::DESCRIPTION])), "\n",
            "Rank: ", $page[CrawlConstants::DOC_RANK], "\n",
            "Relevance: ", $page[CrawlConstants::RELEVANCE], "\n",
            "Proximity: ", $page[CrawlConstants::PROXIMITY], "\n",
            "Score: ", $page[CrawlConstants::SCORE], "\n",
            $divider, "\n";
    }

    // index just past the last row shown, capped at the number of rows
    $high = min($data['TOTAL_ROWS'],
        $data['LIMIT'] + $data['RESULTS_PER_PAGE']);
    echo "QUERY STATISTICS\n",
        $divider,
        "LOW: ", $data['LIMIT'], "\n",
        "HIGH: ", $high, "\n",
        "TOTAL ROWS: ", $data['TOTAL_ROWS'], "\n";
}
|
||||
/**
 * Deletes a directory together with everything beneath it
 *
 * The work is done by traverseDirectory(), which is given the function
 * named C\NS_LIB . "deleteFileOrDir" as its callback.
 *
 * @param string $dir Directory name
 * @param boolean $deleteRootToo Delete specified top directory as well
 */
function unlinkRecursive($dir, $deleteRootToo = true)
{
    $delete_callback = C\NS_LIB . "deleteFileOrDir";
    traverseDirectory($dir, $delete_callback, $deleteRootToo);
}
|
||||
/**
 * Recursively traverse a directory structure and call a callback function
 * exactly once on each file and folder found
 *
 * The contents of a folder are visited before the folder itself, so a
 * callback that deletes what it is given can empty a folder before it
 * is asked to remove that folder. If $dir cannot be opened nothing is done.
 *
 * @param string $dir Directory name
 * @param function $callback Function to call as traverse structure; it is
 *      passed the path of the current file or folder
 * @param boolean $rootToo do op on top-level directory as well
 */
function traverseDirectory($dir, $callback, $rootToo = true)
{
    if (!$dh = @opendir($dir)) {
        return;
    }
    while (false !== ($obj = readdir($dh))) {
        if ($obj == '.' || $obj == '..') {
            continue;
        }
        $path = $dir . '/' . $obj;
        if (is_dir($path)) {
            /* only descend here: the callback for $path itself is made
               just below, so asking the recursive call to also handle its
               root would invoke the callback twice on every sub-folder
             */
            traverseDirectory($path, $callback, false);
        }
        // errors from the callback are deliberately suppressed (best effort)
        @$callback($path);
    }
    closedir($dh);
    if ($rootToo) {
        @$callback($dir);
    }
}
|
149
src/examples/WeatherBot.php
Normal file
149
src/examples/WeatherBot.php
Normal file
|
@ -0,0 +1,149 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Harika Nukala harika.nukala@sjsu.edu
|
||||
* @package seek_quarry
|
||||
* @subpackage examples
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\examples\weatherbot;
|
||||
|
||||
/**
|
||||
* This class demonstrates a simple Weather Chat Bot using the Yioop
|
||||
* ChatBot APIs for Yioop Discussion Groups.
|
||||
* To use this bot:
|
||||
* (1) Move this file to some folder of a web server you have access to.
|
||||
* Denote by some_url the url of this folder. If you point your
|
||||
* browser at this folder you should see a message that begins with:
|
||||
* There was a configuration issue with your query.
|
||||
* (2) Create a new Yioop User.
|
||||
* (3) Under Manage Accounts, click on the lock symbol next to Account Details
|
||||
* (4) Check the Bot User check box, click save.
|
||||
* (5) Two form variables should appear: Bot Unique Token and Bot Callback URL.
|
||||
* Fill in a value for Bot Unique Token that matches the value set
|
||||
* for ACCESS_TOKEN in the code within the WeatherBot class.
|
||||
* Fill in some_url (as defined in step (1)) for the value of Bot Callback
|
||||
* URL
|
||||
* (6) Add the user you created in Yioop to the group that you would like
|
||||
* the bot to service. Let the name of this user be user_name.
|
||||
* (7) Talk to your bot in Yioop in this group by commenting on an
|
||||
* already existing thread with a message beginning with @user_name.
|
||||
*/
|
||||
class WeatherBot
{
    /**
     * Url of site that this bot gets weather information from
     */
    const WEATHER_URL = "http://query.yahooapis.com/v1/public/yql";
    /**
     * Token given when setting up the bot in Yioop for callback requests.
     * This bot checks that a request from a Yioop instance sends
     * a timestamp as well as the hash of this timestamp with the bot_token
     * and post data and that these match the expected values
     */
    const ACCESS_TOKEN = "bot_token";
    /**
     * Number of seconds that the passed timestamp can differ from the current
     * time on the WeatherBot machine.
     */
    const TIME_WINDOW = 60;
    /**
     * This is the method called to get the WeatherBot to handle an incoming
     * HTTP request, and echo a weather related message.
     *
     * A configuration-error message is echoed unless the request carries a
     * valid bot_token together with non-empty string values for both the
     * post and bot_name request variables.
     */
    public function processRequest()
    {
        $result = "There was a configuration issue with your query.";
        if ($this->checkBotToken() &&
            !empty($_REQUEST['post']) && is_string($_REQUEST['post']) &&
            !empty($_REQUEST['bot_name']) &&
            is_string($_REQUEST['bot_name'])) {
            /* NOTE(review): FILTER_SANITIZE_STRING is deprecated as of
               PHP 8.1. It is kept because it also encodes quote characters,
               which getWeather() relies on when it places the location
               inside a quoted query literal.
             */
            $location = filter_var($_REQUEST['post'], \FILTER_SANITIZE_STRING);
            $location = trim(mb_strtolower($location));
            $result = $this->getWeather($location);
            if (empty($result)) {
                $result = "I failed to find the weather for that location.\n".
                    "I respond to queries in the format:\n" .
                    " @{$_REQUEST['bot_name']} some_location";
            }
        }
        echo $result;
    }
    /**
     * This method is used to check a request that it comes from a site
     * that knows the bot_token in use by this WeatherBot.
     *
     * The bot_token request variable must have the form HASH*TIMESTAMP where
     * TIMESTAMP is within TIME_WINDOW seconds of the local clock and HASH
     * is the sha256 of ACCESS_TOKEN . TIMESTAMP . post.
     *
     * @return bool true if the token is well-formed, fresh, and matches;
     *      false otherwise
     */
    public function checkBotToken()
    {
        if (empty($_REQUEST['bot_token']) ||
            !is_string($_REQUEST['bot_token'])) {
            return false;
        }
        $token_parts = explode("*", $_REQUEST['bot_token']);
        // both the hash and a numeric timestamp must be present before use
        if (count($token_parts) < 2 || !is_numeric($token_parts[1])) {
            return false;
        }
        $sent_hash = $token_parts[0];
        $timestamp = $token_parts[1];
        if (abs(time() - $timestamp) >= self::TIME_WINDOW) {
            return false;
        }
        $post = (empty($_REQUEST["post"]) || !is_string($_REQUEST["post"])) ?
            "" : $_REQUEST["post"];
        $hash = hash("sha256", self::ACCESS_TOKEN . $timestamp . $post);
        // hash_equals avoids timing attacks; it exists from php 5.6 on
        if (function_exists('hash_equals')) {
            return hash_equals($hash, $sent_hash);
        }
        // strict comparison so digests are never compared as numbers
        return $hash === $sent_hash;
    }
    /**
     * Get weather information about a location
     *
     * @param string $location the location to get weather updates for. It is
     *      placed inside a single-quoted literal of the query, so callers
     *      are expected to pass a value free of raw quote characters
     *      (processRequest does this via filter_var)
     * @return string weather information, or the empty string if the
     *      request failed or the response lacked a temperature or condition
     */
    public function getWeather($location)
    {
        $yql_query = "select * from weather.forecast where woeid in " .
            "(select woeid from geo.places(1) where text='" . $location .
            "')";
        $url = self::WEATHER_URL . "?q=" .
            urlencode($yql_query) . "&format=json";
        $ch = curl_init();
        if ($ch === false) {
            return "";
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $data = curl_exec($ch);
        curl_close($ch);
        // curl_exec returns false on a failed transfer
        if (!is_string($data) || $data === "") {
            return "";
        }
        $result = json_decode($data);
        if (!isset($result->query->results->channel->item->condition)) {
            return "";
        }
        $condition = $result->query->results->channel->item->condition;
        /* Compare against "" rather than using empty(): a temperature of 0
           is a legitimate reading and must not be reported as missing.
         */
        $temp = (isset($condition->temp) && is_scalar($condition->temp)) ?
            (string)$condition->temp : "";
        $text = (isset($condition->text) && is_scalar($condition->text)) ?
            mb_strtolower((string)$condition->text) : "";
        if ($temp === "" || $text === "") {
            return "";
        }
        return "The weather is $temp and $text in $location.";
    }
}
|
||||
// Script entry point: build the bot and let it answer the current request
$bot = new WeatherBot;
$bot->processRequest();
|
||||
|
1116
src/executables/ArcTool.php
Normal file
1116
src/executables/ArcTool.php
Normal file
File diff suppressed because it is too large
Load diff
734
src/executables/ClassifierTool.php
Normal file
734
src/executables/ClassifierTool.php
Normal file
|
@ -0,0 +1,734 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\controllers\ClassifierController;
|
||||
use seekquarry\yioop\library\classifiers\Classifier;
|
||||
|
||||
// This tool may only be run from the command line
if (php_sapi_name() != 'cli') {
    echo "BAD REQUEST";
    exit();
}
/** Load in global configuration settings */
require_once __DIR__ . '/../configs/Config.php';
// Refuse to run until a profile has been configured
if (!C\PROFILE) {
    // the trailing space keeps the two halves of the sentence apart
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
|
||||
/**
 * Immediately throw an exception for all notices and warnings, rather than
 * letting execution continue. Errors of other levels are just echoed.
 *
 * Errors whose level is masked out of the current error_reporting() value
 * are ignored. This covers errors suppressed with @: PHP 7 sets the level to
 * 0 while @ is active, but PHP 8 only masks the non-fatal levels, so testing
 * for exactly 0 would miss the suppression there.
 *
 * @param int $errno level of the error raised
 * @param string $err_str error message
 * @param string $err_file file the error was raised in
 * @param int $err_line line the error was raised on
 * @throws \ErrorException for E_NOTICE and E_WARNING
 * @ignore
 */
function handleError($errno, $err_str, $err_file, $err_line)
{
    if (!(error_reporting() & $errno)) {
        // Error suppressed by @ (or not being reported), so ignore.
        return;
    }
    $msg = "$err_str in $err_file on line $err_line";
    if ($errno == E_NOTICE || $errno == E_WARNING) {
        throw new \ErrorException($msg, $errno);
    } else {
        echo $msg;
    }
}
/* handleError is declared in this file's namespace, so the callback name is
   built from __NAMESPACE__ rather than from the library namespace prefix.
 */
set_error_handler(__NAMESPACE__ . '\\handleError');
|
||||
|
||||
/**
|
||||
* Instructions for how to use classifier tool
|
||||
* @var string
|
||||
*/
|
||||
$INSTRUCTIONS = <<<EOD
|
||||
|
||||
This tool is used to automate the building and testing of classifiers,
|
||||
providing an alternative to the web interface when a labeled training set is
|
||||
available.
|
||||
|
||||
ClassifierTool.php takes an activity to perform, the name of a dataset to use,
|
||||
and a label for the constructed classifier. The activity is the name of one
|
||||
of the 'run*' functions implemented by this class, without the common 'run'
|
||||
prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
|
||||
of two indexes that have the suffixes "Pos" and "Neg", respectively. So if
|
||||
the prefix were "DATASET", then this tool would look for the two existing
|
||||
indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
|
||||
negative examples. Each document in these indexes should be a positive or
|
||||
negative example of the target class, according to whether it's in the "Pos"
|
||||
or "Neg" index. Finally, the label is just the label to be used for the
|
||||
constructed classifier.
|
||||
|
||||
Beyond these options (set with the -a, -d, and -l flags), a number of other
|
||||
options may be set to alter parameters used by an activity or a classifier.
|
||||
These options are set using the -S, -I, -F, and -B flags, which correspond
|
||||
to string, integer, float, and boolean parameters respectively. These flags
|
||||
may be used repeatedly, and each expects an argument of the form NAME=VALUE,
|
||||
where NAME is the name of a parameter, and VALUE is a value parsed according
|
||||
to the flag. The NAME should match one of the keys of the options member of
|
||||
this class, where a period ('.') may be used to specify nesting. For
|
||||
example:
|
||||
|
||||
-I debug=1 # set the debug level to 1
|
||||
-B cls.use_nb=1 # tell the classifier to use Naive Bayes
|
||||
|
||||
To build and evaluate a classifier for the label 'spam', trained using the
|
||||
two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
|
||||
most informative features:
|
||||
|
||||
php ClassifierTool.php -a TrainAndTest -d 'DATASET' -l 'spam'
|
||||
-I cls.chi2.max=25
|
||||
|
||||
The above assumes we are in the folder of ClassifierTool.php
|
||||
EOD;
|
||||
|
||||
/*
 * Multi-byte string functions and multi-byte regular expressions are both
 * switched to UTF-8
 */
mb_internal_encoding('UTF-8');
mb_regex_encoding('UTF-8');
|
||||
|
||||
/**
 * Class used to encapsulate all the activities of the ClassifierTool.php
 * command line script. This script allows one to automate the building and
 * testing of classifiers, providing an alternative to the web interface when
 * a labeled training set is available.
 *
 * @author Shawn Tice
 */
class ClassifierTool
{
    /**
     * Reference to a classifier controller, used to manipulate crawl mixes in
     * the same way that the controller that handles web requests does.
     * @var object
     */
    protected $classifier_controller;
    /**
     * Reference to a crawl model object, also used to manipulate crawl mixes.
     * @var object
     */
    protected $crawl_model;
    /**
     * Options to be used by activities and constructed classifiers. These
     * options can be overridden by supplying an appropriate flag on the
     * command line, where nesting is denoted by a period (e.g., cls.chi2.max).
     * The supported options are:
     *
     *  debug: An integer, the level of debug statements to print. Larger
     *      integers specify more detailed debug output; the default value of
     *      0 indicates no debug output.
     *
     *  max_train: An integer, the maximum number of examples to use when
     *      training a classifier. The default value of null indicates that
     *      all available training examples should be used.
     *
     *  test_interval: An integer, the number of new training examples to be
     *      added before a round of testing on ALL test instances is to be
     *      executed. With an interval of 5, for example, after adding five
     *      new training examples, the classifier would be finalized and used
     *      to classify all test instances. The error is reported for each
     *      round of testing. The default value of null indicates that
     *      testing should only occur after all training examples have been
     *      added.
     *
     *  split: An integer, the number of examples from the entire set of
     *      labeled examples to use for training. The remainder are used for
     *      testing.
     *
     *  cls.use_nb: A boolean, whether or not to use the Naive Bayes
     *      classification algorithm instead of the logistic regression one
     *      in order to finalize the classifier. The default value is false,
     *      indicating that logistic regression should be used.
     *
     *  cls.chi2.max: An integer, the maximum number of features to use when
     *      training the classifier. The default is a relatively
     *      conservative 200.
     *
     * @var array
     */
    public $options = [
        'debug' => 0,
        'max_train' => null,
        'test_interval' => null,
        'split' => 3000,
        'cls' => [
            'use_nb' => false,
            'chi2' => [
                'max' => 200
            ]
        ]
    ];
    /**
     * Initializes the classifier controller and crawl model that will be used
     * to manage crawl mixes, used for iterating over labeled examples.
     */
    public function __construct()
    {
        $this->classifier_controller = new ClassifierController();
        $this->crawl_model = $this->classifier_controller->model("crawl");
    }
    /**
     * Parses the command-line options, returns the required arguments, and
     * updates the member variable $options with any parameters. If any of the
     * required arguments (activity, dataset, or label) are missing, then a
     * message is printed and the program exits. The optional arguments used to
     * set parameters directly modify the class state through the setOptions
     * method.
     *
     * @return array the parsed activity, dataset, and label
     */
    public function parseOptions()
    {
        $shortopts = 'l:a:d:S:I:F:B:';
        $options = getopt($shortopts);
        // checked in this order so the first complaint printed is stable
        $required = [
            'a' => "missing -a flag to choose activity to run\n",
            'l' => "missing -l flag to set classifier label\n",
            'd' => "missing -d flag to choose dataset to use\n",
        ];
        foreach ($required as $flag => $complaint) {
            if (!isset($options[$flag])) {
                echo $complaint;
                exit(1);
            }
        }
        $activity = $options['a'];
        $label = Classifier::cleanLabel($options['l']);
        $dataset_name = $options['d'];
        unset($options['a'], $options['l'], $options['d']);
        // flag letter => function used to cast the VALUE part (null = string)
        $converters = [
            'S' => null,
            'I' => 'intval',
            'F' => 'floatval',
            'B' => 'boolval',
        ];
        foreach ($options as $opt_name => $value) {
            if (array_key_exists($opt_name, $converters)) {
                $this->setOptions($value, $converters[$opt_name]);
            } else {
                echo "unsupported option: {$opt_name}\n";
            }
        }
        return [$activity, $dataset_name, $label];
    }
    /**
     * Parses the options, and if an appropriate activity exists, calls the
     * activity, passing in the label and dataset to be used; otherwise, prints
     * an error and exits. With no command-line arguments at all the usage
     * instructions are printed instead.
     */
    public function main()
    {
        global $argv, $INSTRUCTIONS;
        if (count($argv) < 2) {
            echo $INSTRUCTIONS;
            exit(1);
        }
        list($activity, $dataset_name, $label) = $this->parseOptions();
        $method = "run{$activity}";
        if (method_exists($this, $method)) {
            $this->$method($label, $dataset_name);
        } else {
            echo "no activity: {$activity}\n\n";
            exit(1);
        }
    }

    /* ACTIVITIES */

    /**
     * Trains a classifier on a data set, testing at the specified intervals.
     * The testing interval is set by the test_interval parameter. Each time
     * this activity is run a new classifier is created (replacing an old one
     * with the same label, if necessary), and the classifier remains at the
     * end.
     *
     * @param string $label class label of the new classifier
     * @param string $dataset_name name of the dataset to train and test on
     */
    public function runTrainAndTest($label, $dataset_name)
    {
        $this->setDefault('max_train', 200);
        $this->logOptions();
        $classifier = $this->makeFreshClassifier($label);
        $data = $this->loadDataset($dataset_name, $label);
        $classifier->initBuffer($data['train'], 0);
        $pages = $data['train'];
        $classifier->prepareToLabel();
        $end = min($this->options['max_train'], $pages->length);
        for ($i = 1; $i <= $end; $i++) {
            $page = $pages->nextPage();
            $doc_label = $page['TRUE_LABEL'];
            $key = Classifier::makeKey($page);
            $classifier->addBufferDoc($page, false);
            $classifier->labelDocument($key, $doc_label, false);
            if ($this->isTestPoint($i, $end)) {
                Classifier::setClassifier($classifier);
                $this->testClassifier($classifier, $data);
                /*
                    Testing the classifier puts it into "classify" mode, which
                    uses a different set of data from "label" mode, so it's
                    important to switch back.
                 */
                $classifier->prepareToLabel();
            }
        }
    }
    /**
     * Like the TrainAndTest activity, but uses active training in order to
     * choose the documents to add to the training set. The method simulates
     * the process that an actual user would go through in order to label
     * documents for addition to the training set, then tests performance at
     * the specified intervals.
     *
     * @param string $label class label of the new classifier
     * @param string $dataset_name name of the dataset to train and test on
     */
    public function runActiveTrainAndTest($label, $dataset_name)
    {
        $this->setDefault('max_train', 200);
        $this->logOptions();
        $classifier = $this->makeFreshClassifier($label);
        $data = $this->loadDataset($dataset_name, $label);
        $pages = $data['train'];
        $classifier->prepareToLabel();
        $classifier->initBuffer($pages);
        $end = min($this->options['max_train'], $pages->length);
        for ($i = 1; $i <= $end; $i++) {
            list($new_doc, $disagreement) =
                $classifier->findNextDocumentToLabel();
            if ($new_doc) {
                $key = Classifier::makeKey($new_doc);
                $doc_label = $new_doc['TRUE_LABEL'];
                $classifier->labelDocument($key, $doc_label);
                $classifier->refreshBuffer($pages);
                $classifier->computeBufferDensities();
                $classifier->train();
            }
            if ($this->isTestPoint($i, $end)) {
                Classifier::setClassifier($classifier);
                $this->testClassifier($classifier, $data);
                // testing leaves "label" mode, so switch back before going on
                $classifier->prepareToLabel();
            }
        }
    }

    /* UTILITY METHODS */

    /**
     * Creates a new classifier for a label, first deleting any existing
     * classifier with the same label.
     *
     * @param string $label class label of the new classifier
     * @return object created classifier instance
     */
    public function makeFreshClassifier($label)
    {
        if (Classifier::getClassifier($label)) {
            $this->deleteClassifier($label);
        }
        $classifier = new Classifier($label, $this->options['cls']);
        Classifier::setClassifier($classifier);
        return $classifier;
    }
    /**
     * Deletes an existing classifier, specified by its label, together with
     * the crawl mix (and its iterator state) named after that label, if one
     * exists.
     *
     * @param string $label class label of the existing classifier
     */
    public function deleteClassifier($label)
    {
        Classifier::deleteClassifier($label);
        $mix_name = Classifier::getCrawlMixName($label);
        $mix_time = $this->crawl_model->getCrawlMixTimestamp($mix_name);
        if ($mix_time) {
            $this->crawl_model->deleteCrawlMixIteratorState($mix_time);
            $this->crawl_model->deleteCrawlMix($mix_time);
        }
    }
    /**
     * Fetches the summaries for pages in the indices specified by the passed
     * dataset name. This method looks for existing indexes with names matching
     * the dataset name prefix, and with suffix either "pos" or "neg" (ignoring
     * case). The pages in these indexes are shuffled into one large array, and
     * augmented with a TRUE_LABEL field that records which set they came from
     * originally. The shuffled array is then split according to the `split'
     * option, and all pages up to (but not including) the split index are used
     * for the training set; the remaining pages are used for the test set.
     * If fewer pages than the `split' option were found, a message is printed
     * and the program exits.
     *
     * @param string $dataset_name prefix of index names to draw examples from
     * @param string $class_label class label of the classifier the examples
     *      will be used to train (used to name the crawl mix that iterates
     *      over each index)
     * @return array training and test datasets in an associative array with
     *      keys `train' and `test', where each dataset is wrapped up in a
     *      PageIterator that implements the CrawlMixIterator interface.
     */
    public function loadDataset($dataset_name, $class_label)
    {
        $crawls = $this->crawl_model->getCrawlList(false, true, null);
        $dataset_name = preg_quote($dataset_name, '/');
        $re = '/^RECRAWL::'.$dataset_name.' (pos|neg)$/i';
        $pages = [];
        foreach ($crawls as $crawl) {
            if (!preg_match($re, $crawl['DESCRIPTION'], $groups)) {
                continue;
            }
            $label = strtolower($groups[1]);
            $doc_label = $label == 'pos' ? 1 : -1;
            $mix_iterator =
                $this->classifier_controller->buildClassifierCrawlMix(
                    $class_label, $crawl['CRAWL_TIME']);
            while (!$mix_iterator->end_of_iterator) {
                $new_pages = $mix_iterator->nextPages(5000);
                /*
                    This field can be added to the results from a crawl mix
                    iterator, but we don't care about it, so we just discard
                    it.
                 */
                if (isset($new_pages['NO_PROCESS'])) {
                    unset($new_pages['NO_PROCESS']);
                }
                foreach ($new_pages as $page) {
                    $page['TRUE_LABEL'] = $doc_label;
                    $pages[] = $page;
                }
            }
        }
        shuffle($pages);
        if (count($pages) < $this->options['split']) {
            echo "split is larger than dataset\n";
            exit(1);
        }
        $data = [];
        $data['train'] = new PageIterator(
            array_slice($pages, 0, $this->options['split']));
        $data['test'] = new PageIterator(
            array_slice($pages, $this->options['split']));
        return $data;
    }
    /**
     * Determines whether to run a classification test after a certain number
     * of documents have been added to the training set. Whether or not to test
     * is determined by the `test_interval' option, which may be either null,
     * an integer, or a string. In the first case, testing only occurs after
     * all training examples have been added; in the second case, testing
     * occurs each time an additional constant number of training examples have
     * been added; and in the final case, testing occurs on a fixed schedule of
     * comma-separated offsets, such as "10,25,50,100". An integer interval
     * that is zero or negative cannot define a schedule, so it is treated
     * like null.
     *
     * @param int $i the size of the current training set
     * @param int $total the total number of documents available to be added to
     *      the training set
     * @return bool true if the `test_interval' option specifies that a round
     *      of testing should occur for the current training offset, and false
     *      otherwise
     */
    public function isTestPoint($i, $total)
    {
        $interval = $this->options['test_interval'];
        if (is_null($interval)) {
            return $i == $total;
        }
        if (is_int($interval)) {
            // guard the modulo below against a zero divisor
            if ($interval <= 0) {
                return $i == $total;
            }
            return $i % $interval == 0;
        }
        $re = '/(^|,)'.$i.'(,|$)/';
        return preg_match($re, $interval) === 1;
    }
    /**
     * Finalizes the current classifier, uses it to classify all test
     * documents, and logs the classification error. The current classifier is
     * saved to disk after finalizing (though not before), and left in
     * `classify' mode. The iterator over the test dataset is reset for the
     * next round of testing (if any). When the test dataset is empty no
     * error rate can be computed, and a message saying so is logged instead.
     *
     * @param object $classifier classifier instance to test
     * @param array $data the array of training and test datasets, constructed
     *      by loadDataset, of which only the `test' dataset is used.
     */
    public function testClassifier($classifier, $data)
    {
        $classifier->prepareToFinalize();
        $classifier->finalize();
        Classifier::setClassifier($classifier);
        $classifier->prepareToClassify();
        $wrong = 0;
        $total = 0;
        $pages = $data['test'];
        while (!$pages->end_of_iterator) {
            $page = $pages->nextPage();
            $score = $classifier->classify($page);
            $page_label = $score >= 0.5 ? 1 : -1;
            if ($page_label != $page['TRUE_LABEL']) {
                $wrong++;
            }
            $total++;
        }
        if ($total > 0) {
            $this->log(0, 'error = %.4f', $wrong / $total);
        } else {
            // happens when the `split' option equals the dataset size
            $this->log(-2, 'no test documents available; error not computed');
        }
        $pages->reset();
    }
    /**
     * Writes out logging information according to a detail level. The first
     * argument is an integer (potentially negative) indicating the level of
     * detail for the log message, where larger numbers indicate greater
     * detail. Each message is prefixed with a character according to its level
     * of detail, but if the detail level is greater than the level specified
     * by the `debug' option then nothing is printed. The treatment for the
     * available detail levels are as follows:
     *
     *  -2: Used for errors; always printed; prefix '! '
     *  -1: Used for log of set options; always printed; prefix '# '
     *  0+: Used for normal messages; prefix '> '
     *
     * The second argument is a printf-style string template specifying the
     * message, and each following (optional) argument is used by the template.
     * A newline is added automatically to each message.
     *
     * @param int $level level of detail for the message
     * @param string $message printf-style template for the message
     * @param string $args,... optional arguments to be used for the message
     *      template
     */
    public function log(/* varargs */)
    {
        $args = func_get_args();
        $level = array_shift($args);
        if ($level > $this->options['debug']) {
            return;
        }
        if ($level == -2) {
            echo '! ';
        } else if ($level == -1) {
            echo '# ';
        } else {
            echo '> ';
        }
        // printf needs at least a template; with none, print the bare prefix
        if (!empty($args)) {
            call_user_func_array('printf', $args);
        }
        echo "\n";
    }
    /**
     * Logs the current options using the log method of this class. This method
     * is used to explicitly state which settings were used for a given run of
     * an activity. The detail level passed to the log method is -1. Options
     * whose value is null are not logged.
     *
     * @param array $root (sub)array of options to log; null means log all of
     *      $this->options
     * @param string $prefix dotted path of the enclosing option groups, put
     *      in front of each option name in the log message
     */
    public function logOptions($root = null, $prefix = '')
    {
        if (is_null($root)) {
            $root = $this->options;
        }
        foreach ($root as $key => $value) {
            if (is_array($value)) {
                $this->logOptions($value, $prefix.$key.'.');
                continue;
            }
            if (is_null($value)) {
                continue;
            }
            // spell booleans out rather than printing "1" or ""
            if ($value === false) {
                $value = 'false';
            } else if ($value === true) {
                $value = 'true';
            }
            $this->log(-1, '%s%s = %s', $prefix, $key, strval($value));
        }
    }
    /**
     * Sets one or more options of the form NAME=VALUE according to a converter
     * such as intval, floatval, and so on. The options may be passed in either
     * as a string (a single option) or as an array of strings, where each
     * string corresponds to an option of the same type (e.g., int).
     *
     * An argument without a NAME= part, or whose NAME does not lead to an
     * existing entry of $this->options, is reported through log() at level
     * -2 and otherwise ignored; in particular it never alters other options.
     *
     * @param string|array $opts single option in the format NAME=VALUE, or
     *      array of options, each for the same target type (e.g., int)
     * @param string $converter the name of a function that takes a string and
     *      casts it to a particular type (e.g., intval, floatval)
     */
    public function setOptions($opts, $converter = null)
    {
        if (!is_array($opts)) {
            $opts = [$opts];
        }
        foreach ($opts as $opt) {
            $split = strpos($opt, '=');
            if ($split === false || $split === 0) {
                $this->log(-2,
                    'malformed option (expected NAME=VALUE): "%s"', $opt);
                continue;
            }
            $name = substr($opt, 0, $split);
            // cast because substr gives false for "NAME=" on older PHP
            $value = (string)substr($opt, $split + 1);
            if ($converter) {
                if ($converter == 'boolval' && !function_exists('boolval')) {
                    $value = (bool)$value;
                } else {
                    $value = call_user_func($converter, $value);
                }
            }
            /* Walk down the dotted path. $field must only be assigned to if
               every component was found; otherwise it still refers to some
               enclosing array (possibly all of $this->options).
             */
            $field =& $this->options;
            $known = true;
            foreach (explode('.', $name) as $key) {
                if (is_array($field) && array_key_exists($key, $field)) {
                    $field =& $field[$key];
                } else {
                    $known = false;
                    break;
                }
            }
            if ($known) {
                $field = $value;
            }
            // drop the reference before it can be reused by accident
            unset($field);
            if (!$known) {
                $this->log(-2, 'unknown option: "%s"', $name);
            }
        }
    }
    /**
     * Sets a default value for a runtime parameter. This method is used by
     * activities to specify default values that may be overridden by passing
     * the appropriate command-line flag. The value is only stored if the
     * parameter is currently unset or null.
     *
     * @param string $name dotted path ending with the name of the runtime
     *      parameter to set
     * @param mixed $value what to set it to
     */
    public function setDefault($name, $value)
    {
        $fields = explode('.', $name);
        $last = array_pop($fields);
        $field =& $this->options;
        foreach ($fields as $group) {
            $field =& $field[$group];
        }
        if (!isset($field[$last])) {
            $field[$last] = $value;
        }
    }
}
|
||||
/**
 * This class provides the same interface as an iterator over crawl mixes, but
 * simply iterates over an array.
 *
 * This is used to gather all of the pages for a training set in one go (using
 * a crawl mix iterator), then repeatedly iterate over them in memory, as
 * though they were coming from the original crawl mix iterator.
 *
 * @author Shawn Tice
 */
class PageIterator
{
    /**
     * The array of pages to repeatedly iterate over.
     * @var array
     */
    public $pages;

    /**
     * The total number of pages.
     * @var int
     */
    public $length;

    /**
     * The current offset into the wrapped array.
     * @var int
     */
    public $pos;

    /**
     * Whether or not the last page has been reached.
     * @var bool
     */
    public $end_of_iterator;

    /**
     * Establishes a new iterator over a (potentially empty) array of pages.
     *
     * @param array $pages standard array of pages to iterate over
     */
    public function __construct($pages)
    {
        $this->pages = $pages;
        $this->length = count($pages);
        $this->reset();
    }

    /**
     * Resets the iterator so that the next page will be the first.
     */
    public function reset()
    {
        $this->pos = 0;
        $this->end_of_iterator = ($this->length == 0);
    }

    /**
     * Returns up to the requested number of next pages, potentially an empty
     * array if there are no pages left. This method updates the
     * `end_of_iterator' flag according to whether the last page has been
     * returned.
     *
     * @param int $n maximum number of pages to return; -1 (or any other
     *      negative value) returns all remaining pages
     * @return array next $n pages, or less if there are fewer than $n
     *      pages remaining
     */
    public function nextPages($n = -1)
    {
        // never let the count go negative, even if $pos was moved externally
        $remaining = max(0, $this->length - $this->pos);
        // any negative request means "everything that is left"; previously
        // values such as -2 moved $pos backwards
        $n = ($n < 0) ? $remaining : min($remaining, $n);
        $start = $this->pos;
        $this->pos += $n;
        if ($this->pos >= $this->length) {
            $this->end_of_iterator = true;
        }
        return array_slice($this->pages, $start, $n);
    }

    /**
     * Behaves like nextPages, but returns just the next page (not wrapped in
     * an array) if there is one, and null otherwise.
     *
     * @return array next page if available, and null otherwise
     */
    public function nextPage()
    {
        $next = $this->nextPages(1);
        // array_slice keeps string keys, so take the first element by
        // position rather than assuming it lives at index 0
        return !empty($next) ? reset($next) : null;
    }
}
|
||||
/*
 * Build the command-line tool and hand control to its main() method. An
 * ErrorException escaping from it is printed rather than left uncaught.
 */
try {
    $classifier_tool = new ClassifierTool();
    $classifier_tool->main();
} catch (\ErrorException $e) {
    echo $e, "\n";
}
|
105
src/executables/ClassifierTrainer.php
Normal file
105
src/executables/ClassifierTrainer.php
Normal file
|
@ -0,0 +1,105 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlDaemon;
|
||||
use seekquarry\yioop\library\classifiers\Classifier;
|
||||
|
||||
// This script may only be run from the command line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/*
   We must specify that we want logging enabled
 */
define("seekquarry\\yioop\\configs\\NO_LOGGING", false);
/*
   For crawlLog and Yioop Constants
 */
require_once __DIR__.'/../library/Utility.php';
if (!C\PROFILE) {
    // note the space after "visiting": the two pieces are concatenated
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/*
   We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/*
   If possible, set the memory limit high enough to fit all of the features and
   training documents into memory.
 */
ini_set("memory_limit", "500M");
|
||||
/**
 * This class is used to finalize a classifier via the web interface.
 *
 * Because finalizing involves training a logistic regression classifier on a
 * potentially-large set of training examples, it can take much longer than
 * would be allowed by the normal web execution time limit. So instead of
 * trying to finalize a classifier directly in the controller that handles the
 * web request, the controller kicks off a daemon that simply loads the
 * classifier, finalizes it, and saves it back to disk.
 *
 * The classifier to finalize is specified by its class label, passed as the
 * second command-line argument. The following command would be used to run
 * this script directly from the command-line:
 *
 *    $ php src/executables/ClassifierTrainer.php terminal LABEL
 *
 * @author Shawn Tice
 */
class ClassifierTrainer
{
    /**
     * This is the function that should be called to get the
     * ClassifierTrainer to start training a logistic regression instance for
     * a particular classifier. The class label corresponding to the
     * classifier to be finalized should be passed as the second command-line
     * argument. If no label is given, a message is printed and nothing is
     * trained.
     */
    public function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "ClassifierTrainer");
        // without a label there is no classifier to load; bail out rather
        // than reading an undefined offset and passing null along
        if (!isset($argv[2]) || $argv[2] === '') {
            echo "ClassifierTrainer needs the class label of the " .
                "classifier to finalize as its second argument.\n";
            return;
        }
        $label = $argv[2];
        L\crawlLog("Initializing classifier trainer log..",
            $label.'-ClassifierTrainer', true);
        $classifier = Classifier::getClassifier($label);
        $classifier->prepareToFinalize();
        $classifier->finalize();
        Classifier::setClassifier($classifier);
        L\crawlLog("Training complete.\n");
        CrawlDaemon::stop('ClassifierTrainer', $label);
    }
}
/*
 * Instantiate and run the ClassifierTrainer program
 */
$classifier_trainer = new ClassifierTrainer();
$classifier_trainer->start();
|
461
src/executables/CodeTool.php
Normal file
461
src/executables/CodeTool.php
Normal file
|
@ -0,0 +1,461 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Tool used to help coding with Yioop. Has commands to update copyright info,
|
||||
* clean trailing spaces, find long lines, and do global file searches and
|
||||
* replaces.
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\models\Model;
|
||||
use seekquarry\yioop\library\Utility;
|
||||
|
||||
// This script may only be run from the command line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/** Load in global configuration settings */
require_once __DIR__ . '/../configs/Config.php';
if (!C\PROFILE) {
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
// set to true by a command that ran, suppressing the usage text below
$no_instructions = false;
$model = new Model();
// used by mapPath() to walk directories
$db = $model->db;
// sub-commands understood by this tool; each is a function of the same name
$commands = ["copyright", "clean", "longlines", "search", "replace"];
// only files with one of these extensions are examined or modified
$change_extensions = ["php", "js", "ini", "css", "thtml", "xml"];
// files whose path contains any of these substrings are skipped
$exclude_paths_containing = ["/.", "/extensions/"];
// number of spaces a tab is expanded to by the clean command
$num_spaces_tab = 4;
if (isset($argv[1]) && in_array($argv[1], $commands, true)) {
    $command = C\NS_EXEC . $argv[1];
    // drop the script name and the command so $argv[0] is the path argument
    array_shift($argv);
    array_shift($argv);
    $no_instructions = $command($argv);
}
if (!$no_instructions) {
    echo <<< EOD
CodeTool.php has the following command formats:

php CodeTool.php clean path
Replaces all tabs with four spaces and trims all whitespace off ends of
lines in the folder or file path. Removes trailing ?> from files
Adds a space between if, for, foreach, etc and ( if not present

php CodeTool.php copyright path
Adjusts all lines in the files in the folder at path (or if
path is a file just that) of the form 2009 - \d\d\d\d to
the form 2009 - this_year where this_year is the current year.

php CodeTool.php longlines path
Prints out all lines in files in the folder or file path which are
longer than 80 characters.

php CodeTool.php replace path pattern replace_string
or
php CodeTool.php replace path pattern replace_string effect
Prints all lines matching the regular expression pattern followed
by the result of replacing pattern with replace_string in the
folder or file path. Does not change files.

php CodeTool.php replace path pattern replace_string interactive
Prints each line matching the regular expression pattern followed
by the result of replacing pattern with replace_string in the
folder or file path. Then it asks if you want to update the line.
Lines you choose for updating will be modified in the files.

php CodeTool.php replace path pattern replace_string change
Each line matching the regular expression pattern is updated
by replacing pattern with replace_string in the
folder or file path. This format does not echo anything, it does a global
replace without interaction.

php CodeTool.php search path pattern
Prints all lines matching the regular expression pattern in the
folder or file path.

EOD;
}
|
||||
/**
 * Used to clean trailing whitespace from files in a folder or just from
 * a file given in the command line. It also removes final ?> characters
 * to make php files conform with suggested coding guidelines. Similarly,
 * adds a space between if, for, foreach, etc and ( if not present to
 * match PHP coding guidelines. The per-file work is done by
 * cleanLinesFile(), which mapPath() applies to every file under the path.
 *
 * @param array $args $args[0] contains path to sub-folder/file
 * @return bool $no_instructions false if should output CodeTool.php
 *     instructions
 */
function clean($args)
{
    if (!isset($args[0])) {
        // no path supplied, so the caller should print the usage text
        return false;
    }
    mapPath(realpath($args[0]), C\NS_EXEC . "cleanLinesFile");
    return true;
}
|
||||
/**
 * Brings the copyright range in the files under the supplied
 * sub-folder/file up to date (assuming the Yioop docs format). Every string
 * matching /2009 - \d\d\d\d/ becomes 2009 - current_year. The rewriting is
 * delegated to replaceFile() running in "change" mode.
 *
 * @param array $args $args[0] contains path to sub-folder/file
 * @return bool $no_instructions false if should output CodeTool.php
 *     instructions
 */
function copyright($args)
{
    if (!isset($args[0])) {
        return false;
    }
    $target = realpath($args[0]);
    $current_range = "2009 - " . date("Y");
    // prime replaceFile's remembered pattern, replacement and mode; the
    // empty filename makes this call do nothing else
    replaceFile("", "/2009 \- \d\d\d\d/", $current_range, "change");
    mapPath($target, C\NS_EXEC . "replaceFile");
    return true;
}
|
||||
/**
 * Searches for and echoes the line numbers and contents of lines longer than
 * 80 characters in files in the supplied sub-folder/file. As a side effect
 * "ini" and "xml" are removed from the global list of examined extensions,
 * whether or not a path was supplied.
 *
 * @param array $args $args[0] contains path to sub-folder/file
 * @return bool $no_instructions false if should output CodeTool.php
 *     instructions
 */
function longlines($args)
{
    global $change_extensions;
    // ini and xml files are not checked for long lines
    $change_extensions = array_diff($change_extensions, ["ini", "xml"]);
    if (!isset($args[0])) {
        return false;
    }
    $target = realpath($args[0]);
    // prime searchFile's remembered pattern: any run of 81 non-newlines
    searchFile("", "/([^\n]){81}/u");
    mapPath($target, C\NS_EXEC . "searchFile");
    return true;
}
|
||||
/**
 * Performs a search and replace for a given pattern in files in the supplied
 * sub-folder/file.
 *
 * Note the pattern is passed through preg_quote() before use, so regular
 * expression metacharacters in it are matched literally. Patterns shorter
 * than two characters are ignored.
 *
 * @param array $args $args[0] contains path to sub-folder/file,
 *     $args[1] contains the pattern searching for, $args[2] contains
 *     what it should be replaced with, $args[3] (defaults to effect)
 *     controls the mode of operation. One of "effect", "change", or
 *     "interactive". effect shows line number and lines matching pattern,
 *     but commits no changes; interactive for each match, prompts user
 *     if should do the change, change does a global search and replace
 *     without output
 * @return bool $no_instructions false if should output CodeTool.php
 *     instructions
 */
function replace($args)
{
    if (!isset($args[0], $args[1], $args[2])) {
        return false;
    }
    $target = realpath($args[0]);
    $mode = isset($args[3]) ? $args[3] : "effect";
    if (strlen($args[1]) >= 2) {
        $delimited = "@" . preg_quote($args[1], "@") . "@";
        // prime replaceFile's remembered settings before walking the path
        replaceFile("", $delimited, $args[2], $mode);
        mapPath($target, C\NS_EXEC . "replaceFile");
    }
    return true;
}
|
||||
/**
 * Performs a search for a given pattern in files in the supplied
 * sub-folder/file, echoing the matching lines.
 *
 * Note the pattern is passed through preg_quote() before use, so regular
 * expression metacharacters in it are matched literally. Patterns shorter
 * than two characters are ignored.
 *
 * @param array $args $args[0] contains path to sub-folder/file,
 *     $args[1] contains the pattern searching for
 * @return bool $no_instructions false if should output CodeTool.php
 *     instructions
 */
function search($args)
{
    if (!isset($args[0], $args[1])) {
        return false;
    }
    $target = realpath($args[0]);
    if (strlen($args[1]) >= 2) {
        $delimited = "@" . preg_quote($args[1], "@") . "@";
        // prime searchFile's remembered pattern before walking the path
        searchFile("", $delimited);
        mapPath($target, C\NS_EXEC . "searchFile");
    }
    return true;
}
|
||||
/**
 * Callback that can be applied to each file in a directory being traversed.
 * It checks if the file has the extension of a code file and is not on an
 * excluded path, and if so rewrites strings of the form 2009 - \d\d\d\d
 * to end in the remembered copyright year. The file is only written back
 * when at least one line changed.
 *
 * @param string $filename name of file to check for copyright lines and
 *     update
 * @param mixed $set_year if not false, the end year to use on this and all
 *     subsequent calls; if no year has ever been set the current year
 *     is used
 */
function changeCopyrightFile($filename, $set_year = false)
{
    global $change_extensions;
    static $year = null;
    if ($set_year) {
        $year = $set_year;
    }
    $out_year = "2009 - " . (($year === null) ? date("Y") : $year);
    $path_parts = pathinfo($filename);
    if (!isset($path_parts['extension'])) {
        // files without an extension are never code files for this tool
        return;
    }
    if (excludedPath($filename) ||
        !in_array($path_parts['extension'], $change_extensions)) {
        return;
    }
    $lines = file($filename);
    if ($lines === false) {
        return;
    }
    $out_file = "";
    $change = false;
    foreach ($lines as $line) {
        $new_line = preg_replace("/2009 \- \d\d\d\d/", $out_year, $line);
        // file() keeps each line's own line ending, so plain concatenation
        // reproduces the file without doubling newlines
        $out_file .= $new_line;
        if (strcmp($new_line, $line) != 0) {
            $change = true;
        }
    }
    if ($change) {
        file_put_contents($filename, $out_file);
    }
}
|
||||
/**
 * Callback function applied to each file in the directory being traversed
 * by @see clean(). For files with a code extension that are not on an
 * excluded path it expands tabs to spaces, inserts a space between the
 * keywords if, elseif, else, switch, case, while, foreach, for, catch and a
 * directly following "(", and strips trailing whitespace from every line.
 * If the last line is "?>" it is removed. The file is only written back when
 * something changed; lines are joined with "\n" and no terminator is added
 * after the final line.
 *
 * @param string $filename name of file to clean lines for
 */
function cleanLinesFile($filename)
{
    global $change_extensions;
    global $num_spaces_tab;
    $path_parts = pathinfo($filename);
    if (!isset($path_parts['extension'])) {
        // files without an extension are never code files for this tool
        return;
    }
    if (excludedPath($filename) ||
        !in_array($path_parts['extension'], $change_extensions)) {
        return;
    }
    $lines = file($filename);
    if (empty($lines)) {
        // unreadable or empty file: nothing to clean
        return;
    }
    $spaces = str_repeat(" ", $num_spaces_tab);
    $out_lines = [];
    $change = false;
    $new_line = "";
    foreach ($lines as $line) {
        $new_line = str_replace("\t", $spaces, $line);
        // \b stops identifiers that merely end in a keyword (array_diff(,
        // transform() from being split; the two halves of the pattern are
        // real concatenation so that "while" is part of the alternation
        $new_line = preg_replace('/\b(if|elseif|else|switch|case|' .
            'while|foreach|for|catch)\(/', '$1 (', $new_line);
        $new_line = rtrim($new_line);
        $out_lines[] = $new_line;
        if (strcmp($new_line . "\n", $line) != 0) {
            $change = true;
        }
    }
    if ($new_line == '?>') {
        // drop a closing PHP tag at the very end of the file
        $change = true;
        $out_lines[count($out_lines) - 1] = "\n";
    }
    if ($change) {
        file_put_contents($filename, implode("\n", $out_lines));
    }
}
|
||||
/**
 * Callback function applied to each file in the directory being traversed
 * by @see search(). Echoes the line number and text of every line of
 * $filename that matches the remembered pattern, preceded once per file by
 * a header naming the file. Files without an extension, with an extension
 * outside the global extension list, or on an excluded path are skipped.
 *
 * @param string $filename name of file to search in
 * @param mixed $set_pattern if not false, becomes the search pattern used by
 *     this and all subsequent calls; calling with an empty $filename is how
 *     the callback is initialized
 */
function searchFile($filename, $set_pattern = false)
{
    global $change_extensions;
    static $pattern = "/";
    if ($set_pattern) {
        $pattern = $set_pattern;
    }
    $info = pathinfo($filename);
    if (!isset($info['extension'])) {
        return;
    }
    if (excludedPath($filename) ||
        !in_array($info['extension'], $change_extensions)) {
        return;
    }
    $header_shown = false;
    foreach (file($filename) as $index => $line) {
        if (!preg_match($pattern, $line)) {
            continue;
        }
        if (!$header_shown) {
            $header_shown = true;
            echo "\nIn $filename:\n";
        }
        // line numbers are reported starting from 1
        $num = $index + 1;
        echo " Line $num: $line";
    }
}
|
||||
/**
 * Callback function applied to each file in the directory being traversed
 * by @see replace(). Searches $filename for lines matching the remembered
 * pattern. Depending on the remembered mode ($args[3] as described in
 * replace()), it echoes the matches and/or replaces them with the
 * remembered replacement. The file is only rewritten in "change" and
 * "interactive" modes and only if at least one line actually changed.
 *
 * @param string $filename name of file to search and replace in
 * @param mixed $set_pattern if not false, becomes the search pattern used by
 *     this and all subsequent calls
 * @param mixed $set_replace if not false, becomes the replacement string
 *     used by this and all subsequent calls (may be "" or "0")
 * @param mixed $set_mode if not false, becomes the mode used by this and
 *     all subsequent calls
 */
function replaceFile($filename, $set_pattern = false,
    $set_replace = false, $set_mode = false)
{
    global $change_extensions;
    static $pattern = "/";
    static $replace = "";
    static $mode = "effect";

    if ($set_pattern) {
        $pattern = $set_pattern;
    }
    // compare against false/null explicitly: "0" and "" are legitimate
    // replacement strings but are falsy
    if ($set_replace !== false && $set_replace !== null) {
        $replace = $set_replace;
    }
    if ($set_mode) {
        $mode = $set_mode;
    }

    $path_parts = pathinfo($filename);
    if (!isset($path_parts['extension'])) {
        return;
    }
    if (excludedPath($filename) ||
        !in_array($path_parts['extension'], $change_extensions)) {
        return;
    }
    $lines = file($filename);
    $out_lines = "";
    $no_output = true;
    // change mode rewrites files without echoing anything
    $silent = ($mode == "change");
    $num = 0;
    $change = false;
    foreach ($lines as $line) {
        $num++;
        $new_line = $line;
        if (preg_match($pattern, $line)) {
            if ($no_output && !$silent) {
                $no_output = false;
                echo "\nIn $filename:\n";
            }
            $new_line = preg_replace($pattern, $replace, $line);
            if (!$silent) {
                echo " Line $num: $line";
                echo " Changes to: $new_line";
            }
            if ($mode == "interactive") {
                echo "Do replacement? (Yy - yes, anything else no): ";
                // NOTE(review): readInput() is not defined in this file and
                // is called unqualified -- confirm it resolves at runtime
                $confirm = strtolower(readInput());
                if ($confirm != "y") {
                    $new_line = $line;
                }
            }
            if (strcmp($new_line, $line) != 0) {
                $change = true;
            }
        }
        // lines from file() keep their line ending, so plain concatenation
        // rebuilds the file contents
        $out_lines .= $new_line;
    }
    if ($change && in_array($mode, ["change", "interactive"])) {
        file_put_contents($filename, $out_lines);
    }
}
|
||||
/**
 * Applies the function $callback to each file in $path. If $path is a
 * directory it is walked using the global $db's traverseDirectory method;
 * if it is a single file $callback is called on it directly. If $path does
 * not exist (for example because realpath() returned false for it) a
 * message is printed and $callback is never called.
 *
 * @param string $path to apply map $callback to
 * @param callable $callback function name (or other callable) to call with
 *     the filename of each file in path
 */
function mapPath($path, $callback)
{
    global $db;
    if (!is_string($path) || !file_exists($path)) {
        echo "CodeTool: the given path does not exist.\n";
        return;
    }
    if (is_dir($path)) {
        $db->traverseDirectory($path, $callback, true);
    } else {
        $callback($path);
    }
}
|
||||
/**
 * Checks if $path is amongst a list of paths which should be ignored, that
 * is, whether it contains any of the substrings listed in the global
 * $exclude_paths_containing.
 *
 * @param string $path a file or directory path
 * @return bool whether or not it should be ignored (true == ignore)
 */
function excludedPath($path)
{
    global $exclude_paths_containing;

    foreach ($exclude_paths_containing as $exclude) {
        // strpos !== false rather than the truthiness of strstr(): a match
        // whose remainder is the string "0" would otherwise count as a miss
        if (strpos($path, $exclude) !== false) {
            return true;
        }
    }
    return false;
}
|
2946
src/executables/Fetcher.php
Normal file
2946
src/executables/Fetcher.php
Normal file
File diff suppressed because it is too large
Load diff
215
src/executables/MediaUpdater.php
Normal file
215
src/executables/MediaUpdater.php
Normal file
|
@ -0,0 +1,215 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\CrawlDaemon;
|
||||
use seekquarry\yioop\library\FetchUrl;
|
||||
use seekquarry\yioop\library\MediaConstants;
|
||||
use seekquarry\yioop\library\media_jobs\MediaJob;
|
||||
use seekquarry\yioop\library\WikiParser;
|
||||
|
||||
// This script may only be run from the command line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
ini_set("memory_limit", "1300M");
/** We do want logging, but crawl model and other will try to turn off
 * if we don't set this
 */
define("seekquarry\\yioop\\configs\\NO_LOGGING", false);
/** To guess language based on page encoding */
require_once __DIR__."/../library/LocaleFunctions.php";
if (!C\PROFILE) {
    // note the space after "visiting": the two pieces are concatenated
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
|
||||
/**
 * Separate process/command-line script which can be used to update
 * news sources for Yioop and also handle other kinds of activities such as
 * video conversion. This is as an alternative to using the web app
 * for updating. Makes use of the web-apps code.
 *
 * @author Chris Pollett
 */
class MediaUpdater implements CrawlConstants
{
    /**
     * Shortest time through one iteration of news updater's loop
     */
    const MINIMUM_UPDATE_LOOP_TIME = 10;
    /**
     * Set to 0 by the constructor; not otherwise used within this class
     * (presumably read or written by jobs -- TODO confirm)
     * @var int
     */
    public $delete_time;
    /**
     * Set to 0 by the constructor; not otherwise used within this class
     * (presumably read or written by jobs -- TODO confirm)
     * @var int
     */
    public $retry_time;
    /**
     * The last time feeds were checked for updates
     * @var int
     */
    public $update_time;
    /**
     * If true then it is assumed that mail should be
     * sent using a media updater rather than from within the web app
     *
     * @var bool
     */
    public $mail_mode;
    /**
     * Controls whether media updating should be viewed as only occurring
     * on the name server or should it be viewed as a distributed process
     * amongst all machines in this Yioop instance
     * @var string
     */
    public $media_mode;
    /**
     * List of job this media updater performs
     * @var array
     */
    public $jobs;
    /**
     * Sets up the field variables so that media updating can begin
     */
    public function __construct()
    {
        $this->delete_time = 0;
        $this->retry_time = 0;
        $this->update_time = 0;
        $this->media_mode = "name_server";
        // mail is sent from the web app until the name server says otherwise
        $this->mail_mode = false;
        $this->jobs = [];
    }
    /**
     * This is the function that should be called to get the MediaUpdater to
     * start updating. Calls init to handle the command-line
     * arguments, loads the job classes, then enters the updater's main loop.
     *
     * If no job names are given after the second command-line argument
     * position, every *Job.php file in the base and app media_jobs folders
     * is loaded, a file in the app folder taking the place of a base file of
     * the same name. Otherwise only the named jobs are loaded, again
     * preferring the app folder.
     */
    public function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "MediaUpdater");
        L\crawlLog("\n\nInitialize logger..", "MediaUpdater", true);
        L\crawlLog("Acquiring list of jobs...");
        $job_path = C\BASE_DIR ."/library/media_jobs/";
        $app_job_path = C\APP_DIR ."/library/media_jobs/";
        if (empty($argv[2])) {
            $base_job_files = glob("$job_path*Job.php");
            $job_files = glob("$app_job_path*Job.php");
            foreach ($base_job_files as $job_file) {
                $app_equiv_job = $app_job_path . basename($job_file);
                if (!in_array($app_equiv_job, $job_files)) {
                    $job_files[] = $job_file;
                }
            }
        } else {
            $job_files = [];
            $pre_jobs = array_slice($argv, 2);
            foreach ($pre_jobs as $pre_job) {
                if (file_exists($app_job_path . "{$pre_job}Job.php")) {
                    $job_files[] = $app_job_path . "{$pre_job}Job.php";
                } else if (file_exists($job_path . "{$pre_job}Job.php")) {
                    $job_files[] = $job_path . "{$pre_job}Job.php";
                }
            }
        }
        foreach ($job_files as $job_file) {
            require_once $job_file;
            // derive the class name from the file name itself; cutting at
            // the base folder's length gave wrong names for app-folder files
            $job_name = C\NS_JOBS . basename($job_file, ".php");
            if ($job_name != C\NS_JOBS . "MediaJob") {
                $job = new $job_name($this);
                $this->jobs[] = $job;
                L\crawlLog("... loading $job_name");
            }
        }
        $this->loop();
    }
    /**
     * Main loop for the news updater. Each pass refreshes the updater
     * properties, runs every loaded job, and then sleeps long enough that a
     * pass takes at least MINIMUM_UPDATE_LOOP_TIME seconds.
     */
    public function loop()
    {
        L\crawlLog("In Media Update Loop");
        L\crawlLog("PHP Version in use: " . phpversion());
        while (CrawlDaemon::processHandler()) {
            $start_time = microtime(true);
            $this->getUpdateProperties();
            if (!empty($this->jobs)) {
                foreach ($this->jobs as $job) {
                    $job->run();
                }
            }
            $sleep_time = max(0, ceil(self::MINIMUM_UPDATE_LOOP_TIME -
                L\changeInMicrotime($start_time)));
            if ($sleep_time > 0) {
                L\crawlLog("Ensure minimum loop time by sleeping...".
                    $sleep_time);
                sleep($sleep_time);
            }
        } //end while
        L\crawlLog("Media Updater shutting down!!");
    }
    /**
     * Makes a request to the name server to find out if we are running
     * as a media updater just on the name server or on both the name server
     * as well as all other machines in the Yioop instance. Also picks up
     * whether mail should be sent by the media updater.
     */
    public function getUpdateProperties()
    {
        L\crawlLog("Checking Name Server for Media Updater properties...");
        // result is unused here; the call is kept in case it has side
        // effects -- TODO confirm
        $current_machine = MediaJob::getCurrentMachine();
        $properties = MediaJob::execNameServer(
            "getUpdateProperties");
        if ($properties) {
            if (isset($properties['MEDIA_MODE'])) {
                $this->media_mode = $properties['MEDIA_MODE'];
                L\crawlLog("...Setting media mode to: " .
                    $properties['MEDIA_MODE']);
            }
            if (isset($properties['SEND_MAIL_MEDIA_UPDATER'])) {
                $this->mail_mode =
                    ($properties['SEND_MAIL_MEDIA_UPDATER'] == "true");
                L\crawlLog("...Setting mail mode to: " .
                    (($this->mail_mode) ? "true" : "false"));
            }
        }
        L\crawlLog("Done checking Name Server for Media Updater properties");
    }
}
/*
 * Instantiate and run the MediaUpdater program
 */
$media_updater = new MediaUpdater();
$media_updater->start();
|
||||
|
338
src/executables/Mirror.php
Normal file
338
src/executables/Mirror.php
Normal file
|
@ -0,0 +1,338 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\CrawlDaemon;
|
||||
use seekquarry\yioop\library\FetchUrl;
|
||||
|
||||
// This script may only be launched from the command line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
ini_set("memory_limit","850M"); //so have enough memory to crawl big pages

/** CRAWLING means don't try to use memcache
 * @ignore
 */
define("seekquarry\\yioop\\configs\\NO_CACHE", true);
/** for crawlHash and crawlLog and Yioop constants */
require_once __DIR__."/../library/Utility.php";
// Refuse to run until the instance has a profile
if (!C\PROFILE) {
    // the second fragment starts with a space so the two concatenated
    // pieces do not run together as "visitingits"
    echo "Please configure the search engine instance by visiting" .
        " its web interface on localhost.\n";
    exit();
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
|
||||
/**
|
||||
* This class is responsible for syncing crawl archives between machines using
|
||||
* the SeekQuarry/Yioop search engine
|
||||
*
|
||||
* Mirror periodically queries the queue server asking for a list of files that
|
||||
* have changed in its parent since the last sync time. It then proceeds to
|
||||
* download them.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class Mirror implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Reference to a database object. Used since has directory manipulation
|
||||
* functions
|
||||
* @var object
|
||||
*/
|
||||
public $db;
|
||||
/**
|
||||
* Url or IP address of the name_server to get sites to crawl from
|
||||
* @var string
|
||||
*/
|
||||
public $name_server;
|
||||
|
||||
/**
|
||||
* Last time a sync list was obtained from master machines
|
||||
* @var int
|
||||
*/
|
||||
public $last_sync;
|
||||
/**
|
||||
* Last time the machine being mirrored was notified Mirror.php is still
|
||||
* running
|
||||
* @var int
|
||||
*/
|
||||
public $last_notify;
|
||||
/**
|
||||
* File name where last sync time is written
|
||||
* @var string
|
||||
*/
|
||||
public $last_sync_file;
|
||||
/**
|
||||
* Time of start of current sync
|
||||
* @var int
|
||||
*/
|
||||
public $start_sync;
|
||||
/**
|
||||
* Files to download for current sync
|
||||
* @var array
|
||||
*/
|
||||
public $sync_schedule;
|
||||
/**
|
||||
* Directory to sync
|
||||
* @var string
|
||||
*/
|
||||
public $sync_dir;
|
||||
/**
|
||||
* Url of the Yioop instance we are mirroring
|
||||
* @var string
|
||||
*/
|
||||
public $parent_url;
|
||||
/**
|
||||
* Maximum number of bytes from a file to download in one go
|
||||
*/
|
||||
const DOWNLOAD_RANGE = 50000000;
|
||||
/**
 * Prepares the mirror's fields so that syncing can begin.
 *
 * Creates the data source manager named by the DBMS configuration
 * constant, records the name server (which is also used as the initial
 * parent url), and restores the last sync time from last_sync.txt when
 * that file exists (otherwise 0 is used).
 *
 * @param string $name_server URL or IP address of the name server
 */
public function __construct($name_server)
{
    $manager_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
    $this->db = new $manager_class();
    $this->name_server = $name_server;
    $this->parent_url = $name_server;
    $this->sync_dir = C\CRAWL_DIR."/cache";
    $this->sync_schedule = [];
    $this->last_sync_file = C\CRAWL_DIR."/schedules/last_sync.txt";
    $this->last_sync = (file_exists($this->last_sync_file)) ?
        unserialize(file_get_contents($this->last_sync_file)) : 0;
    // start and notify times both begin at the restored sync time
    $this->start_sync = $this->last_sync;
    $this->last_notify = $this->last_sync;
}
|
||||
/**
 * Entry point used to get the mirror syncing.
 *
 * Passes the command line arguments to CrawlDaemon::init under the
 * process name "Mirror", writes a first line to the "mirror" log, and
 * then enters the main loop() of this class.
 */
public function start()
{
    global $argv;
    CrawlDaemon::init($argv, "Mirror");
    L\crawlLog("\n\nInitialize logger..", "mirror", true);
    $this->loop();
}
|
||||
/**
 * Main loop of the mirror script.
 *
 * While CrawlDaemon::processHandler() keeps returning true, each pass:
 * consumes a pending MirrorMessages.txt (restarting the pass when it
 * carries a stop status), determines the parent url (mirror_parent.txt if
 * present, otherwise the name server), asks checkScheduler() for status,
 * and either sleeps for MIRROR_NOTIFY_FREQUENCY seconds (no connection or
 * no data) or copies the next scheduled file.
 */
public function loop()
{
    L\crawlLog("In Sync Loop");
    L\crawlLog("PHP Version in use: " . phpversion());
    $info[self::STATUS] = self::CONTINUE_STATE;
    $message_path = C\CRAWL_DIR . "/schedules/MirrorMessages.txt";
    $parent_path = C\CRAWL_DIR . "/schedules/mirror_parent.txt";
    while (CrawlDaemon::processHandler()) {
        if (file_exists($message_path)) {
            $info = unserialize(file_get_contents($message_path));
            unlink($message_path);
            $stop_requested = isset($info[self::STATUS]) &&
                $info[self::STATUS] == self::STOP_STATE;
            if ($stop_requested) {
                continue;
            }
        }
        if (!file_exists($parent_path)) {
            L\crawlLog("File: " . $parent_path . " does not exist!");
            L\crawlLog("Assuming parent is name server: ".
                $this->name_server);
            $this->parent_url = $this->name_server;
        } else {
            $this->parent_url = file_get_contents($parent_path);
            L\crawlLog("Read File: " . $parent_path . ".");
            L\crawlLog("Set parent server to: " . $this->parent_url);
        }
        $info = $this->checkScheduler();
        if ($info === false) {
            L\crawlLog("Cannot connect to parent server...".
                " will try again in ".
                C\MIRROR_NOTIFY_FREQUENCY." seconds.");
            sleep(C\MIRROR_NOTIFY_FREQUENCY);
        } else if ($info[self::STATUS] == self::NO_DATA_STATE) {
            L\crawlLog("No data from parent server. Sleeping...");
            sleep(C\MIRROR_NOTIFY_FREQUENCY);
        } else {
            $this->copyNextSyncFile();
        }
    } //end while
    L\crawlLog("Mirror shutting down!!");
}
|
||||
/**
 * Gets status and, if done processing all other mirroring activities,
 * gets a new list of files that have changed since the last synchronization
 * from the web app of the machine we are mirroring with.
 *
 * A sync list is only requested when no sync is in progress
 * (start_sync <= last_sync) and more than MIRROR_SYNC_FREQUENCY seconds
 * have passed since last_sync. Otherwise, the parent is at most sent a
 * "still alive" notification.
 *
 * @return mixed array or bool. Returns false if weren't successful in
 *     contacting web app, otherwise, returns an array with a status
 *     and potentially a list of files to sync
 */
public function checkScheduler()
{
    $info = [];
    $server = $this->parent_url;
    $start_time = microtime(true);
    $time = time();
    // token sent with the request, derived from the time and AUTH_KEY
    $session = md5($time . C\AUTH_KEY);
    $write_sync_time = true;
    $request =
        $server.
        "?c=resource&time=$time&session=$session".
        "&robot_instance=".C\ROBOT_INSTANCE."&machine_uri=".C\WEB_URI.
        "&last_sync=".$this->last_sync;
    if ($this->start_sync <= $this->last_sync &&
        $this->last_sync + C\MIRROR_SYNC_FREQUENCY < $time) {
        // no sync underway and the last one is old enough: fetch a new list
        $request .= "&a=syncList";
        L\crawlLog("Getting Sync List...");
        $info_string = FetchUrl::getPage($request, null, true);
        if ($info_string === false) {
            return false;
        }
        $this->last_notify = $time;
        $info_string = trim($info_string);
        // NOTE(review): this unserializes data received over the network;
        // confirm the parent is trusted. An undecodable reply presumably
        // leaves $info === false, which the caller treats like a
        // connection failure.
        $info = unserialize(gzuncompress(base64_decode($info_string)));
        if (isset($info[self::STATUS]) &&
            $info[self::STATUS] == self::CONTINUE_STATE) {
            // start_sync > last_sync now marks a sync as in progress
            $this->start_sync = $time;
            $this->sync_schedule = $info[self::DATA];
            unset($info[self::DATA]);
        } else if (isset($info[self::STATUS]) &&
            $info[self::STATUS] == self::NO_DATA_STATE) {
            // nothing changed on the parent: advance the in-memory times
            // but skip writing last_sync to disk below
            $this->last_sync = $time;
            $this->start_sync = $time;
            $write_sync_time = false;
        }
    } else {
        // equal times mean no sync is underway, so there is no data to copy
        $info[self::STATUS] = ($this->last_sync == $this->start_sync) ?
            self::NO_DATA_STATE : self::CONTINUE_STATE;
        L\crawlLog("Current time $time, last notify time ".
            $this->last_notify."...");
        if ($time - $this->last_notify > C\MIRROR_NOTIFY_FREQUENCY) {
            $request .= "&a=syncNotify";
            // the response to the notification is not inspected
            FetchUrl::getPage($request, null, true);
            $this->last_notify = $time;
            L\crawlLog("Notifying master that mirror is alive..");
        } else {
            L\crawlLog("So not notifying scheduler..");
        }
    }
    if (count($this->sync_schedule) == 0 && $write_sync_time) {
        // schedule exhausted: the sync that began at start_sync is complete,
        // so record it as the last sync time both in memory and on disk
        $this->last_sync = $this->start_sync;
        $this->db->setWorldPermissionsRecursive($this->sync_dir, true);
        file_put_contents($this->last_sync_file,
            serialize($this->last_sync));
    }
    L\crawlLog(" Time to check Scheduler ".
        L\changeInMicrotime($start_time));
    return $info;
}
|
||||
/**
 * Downloads the next file from the schedule of files to download received
 * from the web app.
 *
 * Directory entries are created locally if missing. Files smaller than
 * DOWNLOAD_RANGE are fetched in one request; larger files are fetched in
 * DOWNLOAD_RANGE sized segments. Whenever a download cannot be completed
 * (size mismatch, local file cannot be opened, or a segment keeps failing)
 * the entry is put back on the schedule and the method returns, so that
 * the caller's loop gets a chance to run again before the next attempt.
 * A re-attempt of a large file starts over from offset 0.
 *
 * NOTE(review): $file['name'] comes from the parent server and is used
 * directly to build a local path -- confirm the parent is trusted or that
 * names are validated elsewhere.
 */
public function copyNextSyncFile()
{
    if (count($this->sync_schedule) <= 0) {
        return;
    }
    $dir = $this->sync_dir;
    $server = $this->parent_url;
    $time = time();
    $session = md5($time . C\AUTH_KEY);
    $file = array_pop($this->sync_schedule);
    $local_path = "$dir/{$file['name']}";
    L\crawlLog("Start syncing {$file['name']}..");
    if ($file['is_dir']) {
        if (file_exists($local_path)) {
            L\crawlLog(".. {$file['name']} directory exists.");
        } else if (mkdir($local_path)) {
            L\crawlLog(".. {$file['name']} directory created.");
        } else {
            // previously a failed mkdir was still logged as "created"
            L\crawlLog(".. {$file['name']} directory could not be created.");
        }
        return;
    }
    $request =
        "$server?c=resource&a=get&time=$time&session=$session".
        "&f=cache&n=" . urlencode($file["name"]);
    if ($file["size"] < self::DOWNLOAD_RANGE) {
        $data = FetchUrl::getPage($request, null, true);
        if ($file["size"] != strlen($data)) {
            array_push($this->sync_schedule, $file);
            L\crawlLog(".. {$file['name']} error ".
                "downloading, retrying.");
            return;
        }
        file_put_contents($local_path, $data);
        L\crawlLog(".. {$file['name']} file copied.");
        return;
    }
    $fh = fopen($local_path, "wb");
    if ($fh === false) {
        // without a valid handle every fwrite below would fail
        array_push($this->sync_schedule, $file);
        L\crawlLog(".. {$file['name']} could not be opened for writing, ".
            "retrying.");
        return;
    }
    // consecutive failures tolerated for one segment before giving
    // control back to the caller's loop
    $max_segment_retries = 5;
    $segment_failures = 0;
    $offset = 0;
    $request .= "&l=".self::DOWNLOAD_RANGE;
    while ($offset < $file['size']) {
        $data = FetchUrl::getPage($request."&o=$offset", null,
            true);
        $end_point = min($offset + self::DOWNLOAD_RANGE, $file["size"]);
        //crude check if we need to redownload segment
        if (strlen($data) != ($end_point - $offset)) {
            $segment_failures++;
            L\crawlLog(".. Download error re-requesting segment");
            if ($segment_failures >= $max_segment_retries) {
                fclose($fh);
                array_push($this->sync_schedule, $file);
                L\crawlLog(".. {$file['name']} error ".
                    "downloading, retrying.");
                return;
            }
            continue;
        }
        $segment_failures = 0;
        fwrite($fh, $data);
        L\crawlLog(".. {$file['name']} downloaded bytes $offset ".
            "to $end_point..");
        $offset += self::DOWNLOAD_RANGE;
    }
    fclose($fh);
    L\crawlLog(".. {$file['name']} file copied.");
}
|
||||
}
|
||||
/*
 * Script entry point: instantiate the Mirror program, pointing it at the
 * configured name server, and run it.
 */
$syncer = new Mirror(C\NAME_SERVER);
$syncer->start();
|
||||
|
152
src/executables/QueryTool.php
Normal file
152
src/executables/QueryTool.php
Normal file
|
@ -0,0 +1,152 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\executables;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
use seekquarry\yioop\library\FileCache;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\controllers\SearchController;
|
||||
|
||||
// This script may only be launched from the command line
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/** so can output plans */
define("seekquarry\\yioop\\configs\\QUERY_STATISTICS", true);
/** Loads common constants for web crawling*/
require_once __DIR__."/../library/LocaleFunctions.php";
// Refuse to run until the instance has a profile
if (!C\PROFILE) {
    // the second fragment starts with a space so the two concatenated
    // pieces do not run together as "visitingits"
    echo "Please configure the search engine instance by visiting" .
        " its web interface on localhost.\n";
    exit();
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
|
||||
/**
|
||||
* Tool to provide a command line query interface to indexes stored in
|
||||
* Yioop! database. Running with no arguments gives a help message for
|
||||
* this tool.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class QueryTool implements CrawlConstants
|
||||
{
|
||||
/**
 * Initializes the QueryTool. Intentionally empty: the tool keeps no state,
 * all work is done in start().
 */
public function __construct()
{
}
|
||||
/**
 * Runs the QueryTool on the supplied command line arguments.
 *
 * Arguments (from the global $argv): 1 = query (required, otherwise the
 * usage message is printed and the script exits), 2 = results per page
 * (default 10) or the word "plan"/"explain", 3 = start offset (default 0),
 * 4 = locale tag (default C\DEFAULT_LOCALE). In plan/explain mode only the
 * query plan is printed; otherwise each result page is printed followed by
 * summary statistics.
 */
public function start()
{
    global $argv;
    if (!isset($argv[1])) {
        $this->usageMessageAndExit();
    }
    $query = $argv[1];
    $results_per_page = 10;
    if (isset($argv[2])) {
        $results_per_page = $argv[2];
    }
    $limit = 0;
    if (isset($argv[3])) {
        $limit = $argv[3];
    }
    $locale_tag = C\DEFAULT_LOCALE;
    if (isset($argv[4])) {
        $locale_tag = $argv[4];
    }
    L\setLocaleObject($locale_tag);
    $start_time = microtime(true);
    $controller = new SearchController();
    $data = $controller->queryRequest($query, $results_per_page, $limit);
    $plan_only = isset($argv[2]) &&
        ($argv[2] == "plan" || $argv[2] == "explain");
    if ($plan_only) {
        $plan = $controller->model("phrase")->db->query_log[0]["PLAN"];
        echo "\n" . $plan ."\n";
        exit();
    }
    if (!isset($data['PAGES'])) {
        $data['PAGES'] = [];
    }
    // label => field for the numeric scores printed after the description
    $score_fields = [
        "Rank: " => self::DOC_RANK,
        "Relevance: " => self::RELEVANCE,
        "Proximity: " => self::PROXIMITY,
        "Score: " => self::SCORE,
    ];
    foreach ($data['PAGES'] as $page) {
        echo "============\n";
        echo "TITLE: ". trim($page[self::TITLE]). "\n";
        echo "URL: ". trim($page[self::URL]). "\n";
        echo "IPs: ";
        if (isset($page[self::IP_ADDRESSES])) {
            foreach ($page[self::IP_ADDRESSES] as $ip) {
                echo $ip." ";
            }
        }
        echo "\n";
        echo "DESCRIPTION: ".wordwrap(trim($page[self::DESCRIPTION]))."\n";
        foreach ($score_fields as $label => $field) {
            echo $label.$page[$field]."\n";
        }
        echo "============\n\n";
    }
    $data['ELAPSED_TIME'] = L\changeInMicrotime($start_time);
    echo "QUERY STATISTICS\n";
    echo "============\n";
    echo "ELAPSED TIME: ".$data['ELAPSED_TIME']."\n";
    if (isset($data['LIMIT'])) {
        echo "LOW: ".$data['LIMIT']."\n";
    }
    if (isset($data['HIGH'])) {
        $high = min($data['TOTAL_ROWS'],
            $data['LIMIT'] + $data['RESULTS_PER_PAGE']);
        echo "HIGH: ".$high."\n";
    }
    if (isset($data['TOTAL_ROWS'])) {
        echo "TOTAL ROWS: ".$data['TOTAL_ROWS']."\n";
    }
    if (isset($data['ERROR'])) {
        echo $data['ERROR']."\n";
    }
}
|
||||
/**
 * Prints the help text describing how to invoke QueryTool.php and then
 * terminates the script with exit().
 */
public function usageMessageAndExit()
{
    $usage = [
        "\nQueryTool.php is used to run a Yioop",
        " query from the command line.\n For example,\n",
        " php QueryTool.php 'chris pollett' \n returns results ".
            "from the default index of a search on 'chris pollett'.\n",
        "The general command format is:\n",
        " php QueryTool.php query num_results start_num lang_tag\n\n",
        "QueryTool.php can also be used to explain the plan by which\n",
        "Yioop will compute query results. For this usage one types:\n",
        " php QueryTool.php query plan\n",
        "or\n",
        " php QueryTool.php query explain\n",
        "For example,",
        " php QueryTool.php 'chris pollett' explain\n",
    ];
    foreach ($usage as $piece) {
        echo $piece;
    }
    exit();
}
|
||||
}
|
||||
/*
 * Script entry point: instantiate the QueryTool and run it on the command
 * line arguments
 */
$query_tool = new QueryTool();
$query_tool->start();
|
2642
src/executables/QueueServer.php
Normal file
2642
src/executables/QueueServer.php
Normal file
File diff suppressed because it is too large
Load diff
BIN
src/favicon.ico
Normal file
BIN
src/favicon.ico
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 KiB |
615
src/index.php
Normal file
615
src/index.php
Normal file
|
@ -0,0 +1,615 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* Main web interface entry point for Yioop!
|
||||
* search site. Used to both get and display
|
||||
* search results. Also used for inter-machine
|
||||
* communication during crawling
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library as L;
|
||||
|
||||
/**
 * Main entry point to the Yioop web app.
 *
 * Initialization is done in a function to avoid polluting the global
 * namespace with variables.
 *
 * In order, this function: loads utility code and constants, performs url
 * rewriting/routing, sends security related headers, starts the session,
 * works out which controller should serve the request (falling back to
 * "search", or "admin" when web access is off or no profile exists),
 * runs any pending upgrades, and finally hands the request to the chosen
 * controller's processRequest() method.
 */
function bootstrap()
{
    /**
     * For error function and yioop constants
     */
    require_once __DIR__ . "/library/Utility.php";
    /**
     * Did we come to this index.php from ../index.php? If so, rewriting
     * must be on
     */
    if (!C\nsdefined("REDIRECTS_ON")) {
        C\nsdefine("REDIRECTS_ON", false);
    }
    /**
     * Check if doing url rewriting, and if so, do initial routing.
     * Note configureRewrites() may exit() for unsupported paths.
     */
    configureRewrites();
    // only install the custom error handler if the ERROR_INFO debug bit is on
    if ((C\DEBUG_LEVEL & C\ERROR_INFO) == C\ERROR_INFO) {
        set_error_handler(C\NS_LIB . "yioop_error_handler");
    }
    /**
     * Load global functions related to localization
     */
    require_once __DIR__."/library/LocaleFunctions.php";
    ini_set("memory_limit","500M");
    // every controller except resource forbids being framed
    if (empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource') {
        header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking
    }
    /*
        Let browsers know that we should be setting the mimetype correctly --
        for well-behaved browsers this should help protect against XSS
        attacks via images containing HTML. Also, might help against PRSSI
        attacks.
     */
    header("X-Content-Type-Options: nosniff");
    // headers above must be sent before the session is started
    if (session_status() == PHP_SESSION_NONE) {
        session_name(C\SESSION_NAME);
        session_start();
    }
    /**
     * Load global functions related to checking Yioop! version
     */
    require_once C\BASE_DIR."/library/UpgradeFunctions.php";
    if (!function_exists('mb_internal_encoding')) {
        echo "PHP Zend Multibyte Support must be enabled for Yioop! to run.";
        exit();
    }
    /**
     * Make an initial setting of controllers. This can be overridden in
     * local_config
     */
    $available_controllers = ["admin", "api", "archive", "cache",
        "classifier", "crawl", "fetch", "group", "jobs", "machine", "resource",
        "search", "settings", "static"];
    // a localControllers() function in the configs namespace can add more
    if (function_exists(C\NS_CONFIGS . "localControllers")) {
        $available_controllers = array_merge($available_controllers,
            C\localControllers());
    }
    // the register controller is only offered for these registration types
    if (in_array(C\REGISTRATION_TYPE, ['no_activation', 'email_registration',
        'admin_activation'])) {
        $available_controllers[] = "register";
    }
    // without web access only this reduced list of controllers is allowed
    if (!C\WEB_ACCESS) {
        $available_controllers = ["admin", "archive", "cache", "crawl","fetch",
            "jobs", "machine"];
    }
    //the request variable c is used to determine the controller
    if (!isset($_REQUEST['c'])) {
        $controller_name = "search";
        // with a landing page configured and no query, show static page Main
        if (C\nsdefined('LANDING_PAGE') && C\LANDING_PAGE &&
            !isset($_REQUEST['q'])) {
            $controller_name = "static";
            $_REQUEST['c'] = "static";
            $_REQUEST['p'] = "Main";
        }
    } else {
        $controller_name = $_REQUEST['c'];
    }
    // unknown or disallowed controller names fall back to a default
    if (!in_array($controller_name, $available_controllers))
    {
        if (C\WEB_ACCESS) {
            $controller_name = "search";
        } else {
            $controller_name = "admin";
        }
    }
    // if no profile exists we force the page to be the configuration page
    if (!C\PROFILE || (C\nsdefined("FIX_NAME_SERVER") && C\FIX_NAME_SERVER)) {
        $controller_name = "admin";
    }
    $locale_tag = L\getLocaleTag();
    if (L\upgradeDatabaseWorkDirectoryCheck()) {
        /**
         * Load global functions needed to upgrade between versions
         * (note only do this if need to upgrade)
         */
        require_once C\BASE_DIR."/library/VersionFunctions.php";
        L\upgradeDatabaseWorkDirectory();
    }
    if (L\upgradeLocalesCheck($locale_tag)) {
        L\upgradeLocales();
    }
    //upgrade manipulations might mess with global locale, so set it back here
    L\setLocaleObject($locale_tag);
    /**
     * Loads controller responsible for calculating
     * the data needed to render the scene
     *
     */
    $controller_class = C\NS_CONTROLLERS . ucfirst($controller_name) .
        "Controller";
    $controller = new $controller_class();
    $controller->processRequest();
}
|
||||
/**
 * Used to set up and handle url rewriting for the Yioop Web app
 *
 * Developers can add new routes by creating a Routes class in
 * the app_dir with a static method getRoutes which should return
 * an associative array of incoming_path => handler function
 *
 * When REDIRECTS_ON is false, only the check for unsupported
 * index.php/something paths is performed. When a path cannot be routed,
 * $_REQUEST['p'] is set to "404" and error.php is included.
 */
function configureRewrites()
{
    // first path component => name of the routing function in this namespace
    $route_map = [
        'advertise' => 'routeDirect',
        'blog' => 'routeDirect',
        'bot' => 'routeDirect',
        'privacy' => 'routeDirect',
        'terms' => 'routeDirect',
        'admin' => 'routeController',
        'register' => 'routeController',
        'settings' => 'routeController',
        's' => "routeSubsearch",
        'more' => 'routeMore',
        'suggest' => 'routeSuggest',
        'group' => 'routeFeeds',
        'thread' => 'routeFeeds',
        'user' => 'routeFeeds',
        'p' => 'routeWiki'
    ];
    // routes from an optional Routes class override the defaults on key clash
    if (class_exists(C\NS. "Routes")) {
        $route_map = array_merge($route_map, Routes::getRoutes());
    }
    /**
     * Check for paths of the form index.php/something which yioop doesn't
     * support
     */
    $s_name = $_SERVER['SCRIPT_NAME']."/";
    $path_name = substr($_SERVER["REQUEST_URI"], 0, strlen($s_name));
    if (strcmp($path_name, $s_name) == 0) {
        $_SERVER["PATH_TRANSLATED"] = C\BASE_DIR;
        $scriptinfo = pathinfo($s_name);
        $_SERVER["PATH_INFO"] = ($scriptinfo["dirname"] == "/") ? "" :
            $scriptinfo["dirname"] ;
        require_once(C\BASE_DIR."/error.php");
        // with redirects on, return to the caller; otherwise end the request
        if (C\REDIRECTS_ON) {
            return;
        }
        exit();
    }
    if (!isset($_SERVER["PATH_INFO"])) {
        $_SERVER["PATH_INFO"] = ".";
    }
    if (!C\REDIRECTS_ON) {
        return;
    }
    /**
     * Now look for and handle routes
     */
    $index_php = "index.php";
    // PHP_SELF minus the trailing "index.php" gives the folder of the app
    $script_path = substr($_SERVER['PHP_SELF'], 0, -strlen($index_php));
    // $request_script becomes the requested path relative to $script_path
    // with any query string (and its "?") removed
    if ($_SERVER['QUERY_STRING'] == "") {
        $request_script = rtrim(
            substr($_SERVER['REQUEST_URI'], strlen($script_path)), "?");
    } else {
        $request_script = substr($_SERVER['REQUEST_URI'], strlen($script_path),
            -strlen($_SERVER['QUERY_STRING']) - 1);
    }
    $request_script = ($request_script == "") ? $index_php : $request_script;
    // a request for the app root or index.php itself needs no routing
    if (in_array($request_script, ['', $index_php])) {
        return;
    }
    $request_parts = explode("/", $request_script);
    $handled = false;
    if (isset($route_map[$request_parts[0]])) {
        if (empty($_REQUEST['c']) || $_REQUEST['c'] == $request_parts[0]) {
            // call the routing function by its namespace-qualified name
            $route = C\NS . $route_map[$request_parts[0]];
            $handled = $route($request_parts);
        } else if (!empty($_REQUEST['c'])) {
            // an explicit, different c request variable takes precedence
            // over the path, so no routing function is run
            $handled = true;
        }
    }
    if (!$handled) {
        $_REQUEST['p'] = "404";
        require_once __DIR__."/error.php";
    }
}
|
||||
/**
 * Routes a request whose first path component names a php script that
 * lives in the same folder as this file (for example, /privacy is served
 * by including privacy.php).
 *
 * @param array $route_args of url parts (split on slash); only the first
 *      entry is used
 * @return bool always true, as the script is included unconditionally
 */
function routeDirect($route_args)
{
    $page_script = __DIR__ . "/". $route_args[0] . ".php";
    // record that the controller was determined by the url path
    $_REQUEST['route']['c'] = true;
    require_once $page_script;
    return true;
}
|
||||
/**
 * Builds the url of a fixed static page of this Yioop instance.
 *
 * With REDIRECTS_ON the url is BASE_URL followed by the page name;
 * otherwise it is BASE_URL followed by name.php.
 *
 * @param string $name of static page
 * @param bool $with_delim whether to append a delimiter so that further
 *      query parameters can be concatenated: ? is used when redirects
 *      are on, & otherwise; nothing is appended when false
 * @return string url for the page in question
 */
function directUrl($name, $with_delim = false)
{
    if (!C\REDIRECTS_ON) {
        $suffix = ($with_delim) ? "&" : "";
        return C\BASE_URL . $name . ".php" . $suffix;
    }
    $suffix = ($with_delim) ? "?" : "";
    return C\BASE_URL . $name . $suffix;
}
|
||||
/**
 * Used to route page requests for pages corresponding to a group, user,
 * or thread feed. If redirects on then urls ending with /feed_type/id map
 * to a page for the id'th item of that feed_type. A third path component
 * selects a wiki page of the group with that id (or the list of wiki pages
 * when it is the word pages).
 *
 * The routing result is written into $_REQUEST; each request variable set
 * from the path is also flagged in $_REQUEST['route'].
 *
 * @param array $route_args of url parts (split on slash).
 * @return bool whether was able to compute a route or not. A path whose
 *      second component is present but is not an integer is not routed.
 */
function routeFeeds($route_args)
{
    if (!isset($route_args[1])) {
        // bare /group, /user, /thread: show the general feed
        $_REQUEST['c'] = "group";
        $_REQUEST['a'] = (isset($_REQUEST['a']) &&
            $_REQUEST['a'] == 'wiki') ? $_REQUEST['a'] : "groupFeeds";
        $_REQUEST['route']['c'] = true;
        $_REQUEST['route']['a'] = true;
        return true;
    }
    /* A bare == against intval() treats any non-numeric string as equal
       to 0 on PHP versions before 8, so ids like "abc" used to be
       accepted there. Requiring a numeric string first gives the PHP 8
       result on every version.
     */
    $is_integer_id = is_numeric($route_args[1]) &&
        $route_args[1] == intval($route_args[1]);
    if (!$is_integer_id) {
        return false;
    }
    $_REQUEST['c'] = "group";
    if (!empty($route_args[2])) {
        $_REQUEST['a'] = 'wiki';
        if ($route_args[2] == 'pages') {
            $_REQUEST['arg'] = 'pages';
            $_REQUEST['route']['arg'] = true;
        } else {
            $_REQUEST['page_name'] = $route_args[2];
            $_REQUEST['route']['page_name'] = true;
        }
    }
    // an activity of wiki (from the path or the query string) is kept,
    // anything else becomes groupFeeds
    $_REQUEST['a'] = (isset($_REQUEST['a']) &&
        $_REQUEST['a'] == 'wiki') ? $_REQUEST['a'] : "groupFeeds";
    $_REQUEST['route']['c'] = true;
    $_REQUEST['route']['a'] = true;
    if ($_REQUEST['a'] == 'wiki') {
        $_REQUEST['group_id'] = $route_args[1];
        $_REQUEST['route']['group_id'] = true;
    } else {
        // request variable is just_thread for threads, otherwise
        // just_group_id or just_user_id
        $end = ($route_args[0] == 'thread') ? "" : "_id";
        $just_id = "just_" . $route_args[0] . $end;
        $_REQUEST[$just_id] = $route_args[1];
        $_REQUEST['route'][$just_id] = true;
    }
    return true;
}
|
||||
/**
 * Builds the url at which a feed item can be accessed on this instance of
 * Yioop, given the type of feed, the identifier of the feed instance, and
 * the controller in use. The form of the url depends on the defined
 * constant REDIRECTS_ON.
 *
 * @param string $type of feed: group, user, thread (empty for the general
 *      feed)
 * @param int $id the identifier for that feed.
 * @param bool $with_delim whether it should be terminated with nothing or
 *      ? or &. Note the pretty admin url for a group (admin/id) always
 *      ends with ?
 * @param string $controller which controller is being used to access the
 *      feed: usually admin or group
 * @return string url for the page in question
 */
function feedsUrl($type, $id, $with_delim = false, $controller = "group")
{
    $has_type = ($type != "");
    if (C\REDIRECTS_ON && $controller == 'group') {
        $path = ($has_type) ? "$type/$id" : "group";
        $tail = ($with_delim) ? "?" : "";
        return C\BASE_URL . $path . $tail;
    }
    $pretty_admin = (C\REDIRECTS_ON && $controller == "admin");
    if ($pretty_admin && $has_type && $type == "group") {
        // this form is always terminated with ?, whatever $with_delim is
        return C\BASE_URL . "admin/$id" . "?";
    }
    $prefix = ($pretty_admin) ? "admin?" : "?c=$controller&";
    $query = "{$prefix}a=groupFeeds";
    if ($has_type) {
        $id_suffix = ($type == 'thread') ? "" : "_id";
        $query .= "&just_{$type}$id_suffix=$id";
    }
    $tail = ($with_delim) ? "&" : "";
    return C\BASE_URL . $query . $tail;
}
|
||||
/**
 * Routes requests for the more and tools link on the landing page.
 * If redirects on, then /more is served by the more activity of the
 * search controller.
 *
 * @param array $route_args of url parts (split on slash); not consulted
 * @return bool always true
 */
function routeMore($route_args)
{
    $routed = ['c' => "search", 'a' => "more"];
    foreach ($routed as $field => $value) {
        $_REQUEST[$field] = $value;
    }
    // flag both variables as having been determined by the url path
    foreach (array_keys($routed) as $field) {
        $_REQUEST['route'][$field] = true;
    }
    return true;
}
|
||||
/**
 * Returns the url for the more and tools link on the landing page. The
 * form of the url depends on the defined constant REDIRECTS_ON.
 *
 * @param bool $with_delim whether to append a delimiter so that further
 *      query parameters can be concatenated: ? is used when redirects
 *      are on, & otherwise; nothing is appended when false
 * @return string url for the page in question
 */
function moreUrl($with_delim = false)
{
    if (!C\REDIRECTS_ON) {
        $suffix = ($with_delim) ? "&" : "";
        return C\BASE_URL . "?a=more" . $suffix;
    }
    $suffix = ($with_delim) ? "?" : "";
    return C\BASE_URL . "more" . $suffix;
}
|
||||
/**
 * Used to route page requests to end-user controllers such as
 * settings, register, admin. Urls ending with /controller_name will
 * be routed to that controller.
 *
 * When the second url part is an integer it is treated as a group id:
 *  - if the request already asks for the wiki activity, only group_id is set;
 *  - if a third url part is present, the wiki activity is selected and the
 *    third part is either the literal "pages" (page listing) or a page name;
 *  - otherwise the group's feed (groupFeeds activity) is selected.
 * Fields filled in from the url are flagged in $_REQUEST['route'].
 *
 * @param array $route_args of url parts (split on slash).
 * @return bool whether was able to compute a route or not (always true)
 */
function routeController($route_args)
{
    $_REQUEST['c'] = $route_args[0];
    $_REQUEST['route']['c'] = true;
    // is_numeric guards the loose comparison: before PHP 8 a non-numeric
    // string such as "abc" compares equal to intval("abc") == 0 and would
    // have been accepted as a group id.
    $has_group_id = isset($route_args[1]) && is_numeric($route_args[1]) &&
        intval($route_args[1]) == $route_args[1];
    if (!$has_group_id) {
        return true;
    }
    if (isset($_REQUEST['a']) && $_REQUEST['a'] == 'wiki') {
        $_REQUEST['group_id'] = $route_args[1];
    } else if (!empty($route_args[2])) {
        $_REQUEST['a'] = 'wiki';
        $_REQUEST['group_id'] = $route_args[1];
        if ($route_args[2] == 'pages') {
            $_REQUEST['arg'] = 'pages';
            $_REQUEST['route']['arg'] = true;
        } else {
            $_REQUEST['page_name'] = $route_args[2];
        }
        // NOTE(review): page_name is flagged as routed for the "pages"
        // listing as well, exactly as before -- confirm this is intended.
        $_REQUEST['route']['page_name'] = true;
        $_REQUEST['route']['a'] = true;
    } else {
        $_REQUEST['a'] = 'groupFeeds';
        $_REQUEST['just_group_id'] = $route_args[1];
    }
    $_REQUEST['route']['group_id'] = true;
    return true;
}
|
||||
/**
 * Given the name of a controller for which an easy end-user link is useful,
 * creates the url where it can be accessed on this instance of Yioop,
 * making use of the defined variable REDIRECTS_ON. Examples of end-user
 * controllers would be the settings, admin, and register controllers.
 * When redirects are on, the c field is also flagged in $_REQUEST['route'].
 *
 * @param string $name of controller
 * @param bool $with_delim whether the url should end with a delimiter
 *      ready for further query parameters (? when redirects are on, &
 *      otherwise) or with nothing
 * @return string url for the page in question
 */
function controllerUrl($name, $with_delim = false)
{
    if (!C\REDIRECTS_ON) {
        $tail = ($with_delim) ? "&" : "";
        return C\BASE_URL . "?c=" . $name . $tail;
    }
    $_REQUEST['route']['c'] = true;
    $tail = ($with_delim) ? "?" : "";
    return C\BASE_URL . $name . $tail;
}
|
||||
/**
 * Used to route page requests for subsearches such as news, video, and images
 * (site owner can define other). Urls of the form /s/subsearch will
 * go to the page handling the subsearch.
 *
 * @param array $route_args of url parts (split on slash); the subsearch
 *      name is expected in position 1
 * @return bool whether was able to compute a route or not: false when no
 *      subsearch name was supplied
 */
function routeSubsearch($route_args)
{
    if (!isset($route_args[1])) {
        return false;
    }
    $_REQUEST['route']['c'] = true;
    $_REQUEST['route']['s'] = true;
    $_REQUEST['c'] = "search";
    $_REQUEST['s'] = $route_args[1];
    return true;
}
|
||||
/**
 * Given the name of a subsearch, creates the url where it can be accessed
 * on this instance of Yioop, making use of the defined variable REDIRECTS_ON.
 * Examples of subsearches include news, video, and images. A site owner
 * can add to these and delete from these.
 *
 * @param string $name of subsearch
 * @param bool $with_delim whether the url should end with a delimiter
 *      ready for further query parameters (? when redirects are on, &
 *      otherwise) or with nothing
 * @return string url for the page in question
 */
function subsearchUrl($name, $with_delim = false)
{
    if (!C\REDIRECTS_ON) {
        $tail = ($with_delim) ? "&" : "";
        return C\BASE_URL . "?s=" . $name . $tail;
    }
    $tail = ($with_delim) ? "?" : "";
    return C\BASE_URL . "s/" . $name . $tail;
}
|
||||
/**
 * Used to route requests for the suggest-a-url link on the tools page.
 * If redirects are on, then /suggest routes to this suggest-a-url page,
 * i.e., the suggestUrl activity of the register controller.
 *
 * @param array $route_args of url parts (split on slash); not used here
 * @return bool whether was able to compute a route or not (always true)
 */
function routeSuggest($route_args)
{
    $target = ['c' => "register", 'a' => "suggestUrl"];
    foreach ($target as $field => $value) {
        $_REQUEST[$field] = $value;
    }
    return true;
}
|
||||
/**
 * Return the url for the suggest-a-url link on the more tools page, making use
 * of the defined variable REDIRECTS_ON. When redirects are on, the c and a
 * fields are also flagged in $_REQUEST['route'].
 *
 * @param bool $with_delim whether the url should end with a delimiter
 *      ready for further query parameters (? when redirects are on, &
 *      otherwise) or with nothing
 * @return string url for the page in question
 */
function suggestUrl($with_delim = false)
{
    if (!C\REDIRECTS_ON) {
        $tail = ($with_delim) ? "&" : "";
        return C\BASE_URL . "?c=register&a=suggestUrl" . $tail;
    }
    $_REQUEST['route']['c'] = true;
    $_REQUEST['route']['a'] = true;
    $tail = ($with_delim) ? "?" : "";
    return C\BASE_URL . "suggest" . $tail;
}
|
||||
/**
 * Used to route page requests for pages corresponding to a wiki page of
 * a group. If it is a wiki page for the public group viewed without being
 * logged in, the route might come in as yioop_instance/p/page_name if
 * redirects are on. If it is for a non-public wiki or a page accessed while
 * logged in, the url will look like either:
 * yioop_instance/group/group_id?a=wiki&page_name=some_name
 * or
 * yioop_instance/admin/group_id?a=wiki&page_name=some_name&csrf_token_string
 *
 * The special name "pages" is sent to the wiki page listing of the group
 * controller; any other name is served by the static controller.
 *
 * @param array $route_args of url parts (split on slash); the page name is
 *      expected in position 1
 * @return bool whether was able to compute a route or not: false when no
 *      page name was supplied
 */
function routeWiki($route_args)
{
    if (!isset($route_args[1])) {
        return false;
    }
    $page = $route_args[1];
    if ($page == 'pages') {
        $_REQUEST['c'] = "group";
        $_REQUEST['a'] = 'wiki';
        $_REQUEST['arg'] = 'pages';
        foreach (['c', 'a', 'arg'] as $routed_field) {
            $_REQUEST['route'][$routed_field] = true;
        }
        return true;
    }
    $_REQUEST['c'] = "static";
    $_REQUEST['p'] = $page;
    foreach (['c', 'p'] as $routed_field) {
        $_REQUEST['route'][$routed_field] = true;
    }
    return true;
}
|
||||
/**
 * Given the name of a wiki page, the group it belongs to, and which
 * controller is being used, creates the url where that wiki page can be
 * accessed from the instance of Yioop. It makes use of the
 * defined variable REDIRECTS_ON.
 *
 * @param string $name of wiki page; an empty name selects the main page for
 *      the static controller and the bare wiki activity otherwise; without
 *      redirects the name "pages" selects the page listing
 * @param bool $with_delim whether the url should end with a delimiter
 *      ready for further query parameters (? or &) or with nothing
 * @param string $controller which controller is being used to access the
 *      page: usually static (for the public group), admin, or group
 * @param int $id the group the wiki page belongs to
 * @return string url for the page in question
 */
function wikiUrl($name, $with_delim = false, $controller = "static", $id =
    C\PUBLIC_GROUP_ID)
{
    $q = ($with_delim) ? "?" : "";
    $a = ($with_delim) ? "&" : "";
    $is_static = ($controller == "static");
    if (C\REDIRECTS_ON) {
        if ($is_static) {
            if ($name == "") {
                $name = "Main";
            }
            return C\BASE_URL . "p/$name$q";
        }
        $page = ($name == "") ? "?a=wiki$a" : "/$name$q";
        return C\BASE_URL . $controller . "/$id$page";
    }
    if ($name == 'pages') {
        if ($is_static) {
            // The static controller has no page listing, so the listing is
            // served by the group controller (routeWiki does the same).
            // This used to read the undefined variable $group, which
            // produced an empty c= parameter.
            $controller = "group";
        }
        return C\BASE_URL .
            "?c=$controller&a=wiki&arg=pages&group_id=$id$a";
    }
    if ($is_static) {
        if ($name == "") {
            // NOTE(review): the redirect branch defaults to "Main" but this
            // one to "main" -- left as is; confirm which casing is intended.
            $name = "main";
        }
        return C\BASE_URL . "?c=static&p=$name$a";
    }
    $page = ($name == "") ? "" : "&page_name=$name";
    return C\BASE_URL . "?c=$controller&a=wiki&group_id=$id$page$a";
}
|
||||
// Run bootstrap() when this file is loaded, unless the configuration
// constant SKIP_BOOTSTRAP has been defined.
if (defined('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP') === false) {
    bootstrap();
}
|
67
src/library/AnalyticsManager.php
Normal file
67
src/library/AnalyticsManager.php
Normal file
|
@ -0,0 +1,67 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
|
||||
/**
 * Static registry used to set and get SQL query and search query timing
 * statistics between models and index_bundle_iterators
 *
 * @author Chris Pollett
 */
class AnalyticsManager
{
    /**
     * Backing store for everything recorded through set(), keyed by
     * attribute name
     * @var array
     */
    private static $data = [];
    /**
     * Looks up the statistic recorded under $attribute
     *
     * @param string $attribute name of the statistic to fetch
     * @return mixed whatever was stored for that statistic, or null when
     *      nothing (or null) has been stored under that name
     */
    public static function get($attribute)
    {
        if (isset(self::$data[$attribute])) {
            return self::$data[$attribute];
        }
        return null;
    }
    /**
     * Records the statistic $value under $attribute, replacing any earlier
     * value stored under the same name
     *
     * @param string $attribute name of the statistic to store
     * @param mixed $value whatever timing information is to be associated
     *      with $attribute
     */
    public static function set($attribute, $value)
    {
        self::$data[$attribute] = $value;
    }
}
|
779
src/library/BTree.php
Normal file
779
src/library/BTree.php
Normal file
|
@ -0,0 +1,779 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
|
||||
* This class implements the B-Tree data structure for storing int key based
|
||||
* key-value pairs based on the algorithms in Introduction To Algorithms,
|
||||
* by T.H. Cormen, C.E. Leiserson, R.L. Rivest, and C. Stein. Second
|
||||
* Edition, 2001, The MIT Press
|
||||
*
|
||||
* @author Akshat Kukreti
|
||||
*/
|
||||
class BTree
|
||||
{
|
||||
/**
|
||||
* Default value of minimum degree. The minimum degree determines the
|
||||
* minimum and maximum number of keys and child nodes, for nodes
|
||||
* other than root node
|
||||
*/
|
||||
const MIN_DEGREE = 501;
|
||||
/**
|
||||
* Minimum degree of the B-Tree. Used in determining the minimum/maximum
|
||||
* keys and links a B-Tree node may have.
|
||||
* minimum_keys = minimum_degree - 1
|
||||
* minimum_links = minimum_keys + 1
|
||||
* maximum_keys = 2 * minimum_degree - 1
|
||||
* maximum_links = maximum_keys + 1
|
||||
* @var int
|
||||
*/
|
||||
public $min_degree;
|
||||
/**
|
||||
* Storage for root node of the B-Tree
|
||||
* @var object
|
||||
*/
|
||||
public $root;
|
||||
/**
|
||||
* Counter for node Ids
|
||||
* @var int
|
||||
*/
|
||||
public $id_count;
|
||||
/**
|
||||
* Directory for storing the B-Tree files
|
||||
* @var string
|
||||
*/
|
||||
public $dir;
|
||||
/**
 * Creates/Loads a B-Tree stored in the given directory with the given
 * minimum degree (501 by default). The directory is created if it does
 * not exist. If a root.txt file is already present, the root node and the
 * node id counter (count.txt) are unserialized from disk; otherwise the
 * tree starts with a fresh node whose id is "root" and an id counter of 1.
 *
 * @param string $dir is the directory for storing the B-Tree files
 * @param int $min_degree minimum degree of a B-tree node
 */
public function __construct($dir, $min_degree = self::MIN_DEGREE)
{
    $this->dir = $dir;
    $this->min_degree = $min_degree;
    if (!is_dir($this->dir)) {
        mkdir($this->dir);
        @chmod($this->dir, 0777);
    }
    $root_file = $this->dir . "/root.txt";
    if (!file_exists($root_file)) {
        $fresh_root = new BTNode();
        $fresh_root->id = "root";
        $this->root = $fresh_root;
        $this->id_count = 1;
        return;
    }
    $this->root = unserialize(file_get_contents($root_file));
    $count_file = $this->dir . "/count.txt";
    $this->id_count = unserialize(file_get_contents($count_file));
}
|
||||
/**
 * Reads a node back from its file (DIR/ID.txt) on disk
 *
 * @param int $id is the id of the node to be read
 * @return mixed the unserialized node, or false (after logging a message)
 *      if no file exists for that id
 */
public function readNode($id)
{
    $path = $this->dir . "/$id.txt";
    if (!file_exists($path)) {
        crawlLog("Btree could not read node $id from disk");
        return false;
    }
    return unserialize(file_get_contents($path));
}
|
||||
/**
 * Serializes a node into its file (DIR/ID.txt) on disk and then tries to
 * make that file world readable and writable
 *
 * @param object $node is the node to be written to disk
 */
public function writeNode($node)
{
    $path = $this->dir . "/" . $node->id . ".txt";
    file_put_contents($path, serialize($node));
    @chmod($path, 0777);
}
|
||||
/**
 * Persists the current root node of this B-Tree to disk
 */
public function writeRoot()
{
    $root = $this->root;
    $this->writeNode($root);
}
|
||||
/**
 * Removes the file (DIR/ID.txt) associated with a node from disk. If there
 * is no such file, a message is logged instead.
 *
 * @param int $id is the id of the node whose file is to be deleted
 */
public function deleteNodeFile($id)
{
    $path = $this->dir . "/$id.txt";
    if (!file_exists($path)) {
        crawlLog("Could not delete node $id from disk");
        return;
    }
    unlink($path);
}
|
||||
/**
 * Saves the current value of the node id counter ($this->id_count),
 * serialized, to DIR/count.txt
 */
public function saveNodeCount()
{
    file_put_contents($this->dir . "/count.txt",
        serialize($this->id_count));
}
|
||||
/**
 * Deletes the file (DIR/count.txt) holding the node id counter
 */
public function deleteCount()
{
    $count_file = $this->dir . "/count.txt";
    unlink($count_file);
}
|
||||
/**
 * Returns the key-value pair in the B-Tree that has the given key
 *
 * @param int $key is the key for which the key-value pair is to be
 *      found
 * @return array key-value pair associated with $key or null if the
 *      key-value pair is not found in the tree.
 */
public function findValue($key)
{
    list($holder, $found, $index) = $this->search($this->root, $key);
    // search() reports a null position for an empty node and a flag
    // other than 1 when the key was not located
    if ($index !== null && $found == 1) {
        return $holder->keys[$index];
    }
    return null;
}
|
||||
/**
 * Searches for the key-value pair with a given key, starting at a node.
 * If the pair is not in that node, the search moves on to the root of the
 * sub-tree that could contain it, and so on until the pair is found or a
 * leaf (or an empty node) has been examined.
 *
 * @param object $node is the B-Tree node from where the search starts
 * @param int $key is the key for which the key-value pair is to be
 *      searched
 * @return array triple [node, flag, position]: node is the last node
 *      examined, flag is 1 if the key was found there and -1 otherwise,
 *      position is the index reported by binarySearch within that node
 *      (null if the node had no keys)
 */
public function search($node, $key)
{
    $current = $node;
    while (true) {
        if (empty($current->keys)) {
            return [$current, -1, null];
        }
        list($found, $index) = $this->binarySearch($current->keys, $key);
        if ($found == 1 || $current->is_leaf == true) {
            return [$current, $found, $index];
        }
        // not here and not a leaf: follow the link at the position the
        // binary search stopped at
        $current = $this->readNode($current->links[$index]);
    }
}
|
||||
/**
 * Inserts a key-value pair in the B-Tree. An empty root simply receives
 * the pair. A full root (2 * min_degree - 1 keys) is first split under a
 * newly created parent which takes over the id "root"; the pair is then
 * inserted below it. Otherwise the pair is inserted starting from the root.
 *
 * @param array $pair is the key-value pair to be inserted
 */
public function insert($pair)
{
    $old_root = $this->root;
    if (empty($old_root->keys)) {
        $old_root->keys = [$pair];
        $old_root->count = count($old_root->keys);
        $this->writeNode($old_root);
        $this->saveNodeCount();
        return;
    }
    $max_keys = 2 * $this->min_degree - 1;
    if ($old_root->count != $max_keys) {
        $this->insertNodeNotFull($old_root, $pair);
        return;
    }
    $new_root = $this->createEmptyParentNode();
    $this->root = $new_root;
    // exchange ids so the new parent is stored as "root" and the old
    // root gets the freshly allocated id
    $this->swap($new_root->id, $old_root->id);
    $new_root->links[0] = $old_root->id;
    $this->bTreeSplitChild($new_root, 0, $old_root);
    $this->insertNodeNotFull($new_root, $pair);
}
|
||||
/**
 * Inserts a key-value pair into the sub-tree rooted at a node that is not
 * full. If the key is already present in a node visited on the way down,
 * the stored pair is overwritten. Otherwise the appropriate leaf is
 * searched for, splitting full children before descending into them, and
 * the pair is added to that leaf in sorted position.
 *
 * @param object $node is the node from where the search for the leaf node
 *      begins
 * @param array $pair is the key-value pair; the key is $pair[0]
 */
public function insertNodeNotFull($node, $pair)
{
    $key = $pair[0];
    list($flag, $pos) = $this->binarySearch($node->keys, $key);
    if ($flag == 1) {
        // key already stored in this node: replace its pair
        $node->keys[$pos] = $pair;
        $this->writeNode($node);
        return;
    }
    $i = $node->count - 1;
    if ($node->is_leaf == true) {
        // shift larger keys one slot to the right to open a gap
        while ($i >= 0 && $node->keys[$i][0] > $key) {
            $node->keys[$i + 1] = $node->keys[$i];
            $i -= 1;
        }
        $node->keys[$i + 1] = $pair;
        $node->count = count($node->keys);
        $this->writeNode($node);
        return;
    }
    // internal node: find the child link the key belongs under
    while ($i >= 0 && $node->keys[$i][0] > $key) {
        $i -= 1;
    }
    $i += 1;
    $next_node = $this->readNode($node->links[$i]);
    if ($next_node->count == 2 * $this->min_degree - 1) {
        $this->bTreeSplitChild($node, $i, $next_node);
        // the split promoted the child's median pair to position $i
        $median_key = $node->keys[$i][0];
        if ($key == $median_key) {
            // The key being inserted was the child's median and now lives
            // in this node. Overwrite it here; descending would add a
            // second copy of the key lower in the tree.
            $node->keys[$i] = $pair;
            $this->writeNode($node);
            return;
        }
        if ($key > $median_key) {
            $i += 1;
            $next_node = $this->readNode($node->links[$i]);
        }
    }
    $this->insertNodeNotFull($next_node, $pair);
}
|
||||
/**
 * Splits a full child node into two nodes. The upper min_degree - 1 keys
 * (and, for a non-leaf, the upper min_degree links) move to a newly
 * created node, the median key-value pair is added to the parent, and the
 * new node is linked into the parent just after the child. All three
 * nodes are written to disk.
 *
 * @param object $parent is the parent node
 * @param int $i is the position in $parent of the link to the child node
 * @param object $child is the (full) child node
 */
public function bTreeSplitChild($parent, $i, $child)
{
    $degree = $this->min_degree;
    $this->id_count += 1;
    $sibling = new BTNode();
    $sibling->id = $this->id_count;
    $this->saveNodeCount();
    $sibling->is_leaf = $child->is_leaf;
    $sibling->count = $degree - 1;
    // copy the keys above the median into the new sibling
    for ($k = 0; $k < $degree - 1; $k++) {
        $sibling->keys[$k] = $child->keys[$degree + $k];
    }
    if ($child->is_leaf == false) {
        for ($k = 0; $k < $degree; $k++) {
            $sibling->links[$k] = $child->links[$degree + $k];
        }
    }
    // open a slot in the parent's links and put the sibling there
    $k = $parent->count;
    while ($k > $i) {
        $parent->links[$k + 1] = $parent->links[$k];
        $k--;
    }
    $parent->links[$k + 1] = $sibling->id;
    // open a slot in the parent's keys and put the median pair there
    $k = $parent->count - 1;
    while ($k >= $i) {
        $parent->keys[$k + 1] = $parent->keys[$k];
        $k--;
    }
    $parent->keys[$k + 1] = $child->keys[$degree - 1];
    $parent->count = count($parent->keys);
    // the child keeps only what lies below the median
    $child->keys = array_slice($child->keys, 0, $degree - 1);
    if ($child->is_leaf == false) {
        $child->links = array_slice($child->links, 0, $degree);
    }
    $child->count = count($child->keys);
    $this->writeNode($child);
    $this->writeNode($sibling);
    $this->writeNode($parent);
}
|
||||
/**
 * Exchanges the values of two variables (both passed by reference)
 *
 * @param mixed &$x is the first variable
 * @param mixed &$y is the second variable
 */
public function swap(&$x, &$y)
{
    list($x, $y) = [$y, $x];
}
|
||||
/**
 * Creates an empty non-leaf node. The node receives the next node id; the
 * advanced id counter is saved to disk.
 *
 * @return object the new non-leaf node
 */
public function createEmptyParentNode()
{
    $this->id_count = $this->id_count + 1;
    $parent = new BTNode();
    $parent->id = $this->id_count;
    $this->saveNodeCount();
    $parent->is_leaf = false;
    return $parent;
}
|
||||
/**
 * Performs binary search for an integer key on an array of key-value pairs
 * ordered by their integer keys (the key of a pair is its element 0)
 *
 * @param array $keys is an array containing key-value pairs
 * @param int $key is the key
 * @return array pair [flag, position]: flag is 1 if the key was found and
 *      -1 if not; position is the index of the matching pair, or, when
 *      there is no match, the index at which the search stopped
 */
public function binarySearch($keys, $key)
{
    $lo = 0;
    $hi = count($keys) - 1;
    while ($lo <= $hi) {
        // midpoint rounded down; both bounds are non-negative here
        $mid = ($lo + $hi) >> 1;
        $candidate = $keys[$mid][0];
        if ($key == $candidate) {
            return [1, $mid];
        }
        if ($key > $candidate) {
            $lo = $mid + 1;
        } else {
            $hi = $mid - 1;
        }
    }
    return [-1, $lo];
}
|
||||
/**
 * Removes a key-value pair from the B-Tree, searching from the root
 *
 * @param int $key associated with the key-value pair to be deleted
 */
public function remove($key)
{
    $start = $this->root;
    $this->delete($start, $key);
}
|
||||
/**
 * Deletes a key-value pair from the sub-tree rooted at a node.
 * Handles deletion from a leaf node and from an internal node. If the key
 * is found in an internal node, reArrange() is first given the chance to
 * move it into a child, and the node is then searched again. If the key
 * is not in an internal node, the recursion descends to the root of the
 * sub-tree that may hold it; it stops at a leaf node that does not have
 * the key-value pair to be deleted.
 *
 * @param object $node is from where the key search starts
 * @param int $key is the key to be deleted
 */
public function delete($node, $key)
{
    list($found, $index) = $this->binarySearch($node->keys, $key);
    if ($found == 1 && $node->is_leaf == false) {
        $this->reArrange($node, $index);
    }
    // reArrange may have changed the node's keys, so look again
    list($found, $index) = $this->binarySearch($node->keys, $key);
    $at_leaf = ($node->is_leaf == true);
    if ($found == 1) {
        if ($at_leaf) {
            $this->deleteFromLeaf($node, $index);
        } else {
            $this->deleteFromNonLeaf($node, $index);
        }
    } else if (!$at_leaf) {
        $sub_tree_root = $this->getDescendant($node, $index);
        $this->delete($sub_tree_root, $key);
    }
}
|
||||
/**
 * Shifts a key from a non-leaf node into one of its child nodes, using the
 * children just before and just after the key-value pair to be deleted.
 * If the preceding child has at least min_degree keys, its last key-value
 * pair is moved to the position of the pair that is to be deleted, and
 * that pair moves into the following child. Otherwise, if the following
 * child has at least min_degree keys, the same is done in the other
 * direction with its first key-value pair. If neither child has enough
 * keys, nothing is changed.
 *
 * @param object &$node is the internal node containing the key-value pair
 *      to be deleted
 * @param int $pos is the position of the key-value pair within $node
 */
public function reArrange(&$node, $pos)
{
    $before = $this->readNode($node->links[$pos]);
    $after = $this->readNode($node->links[$pos + 1]);
    if ($before->count >= $this->min_degree) {
        $this->adjustChildUsingLeftSiblingAndParent($node, $after, $before,
            $pos + 1);
        return;
    }
    if ($after->count >= $this->min_degree) {
        $this->adjustChildUsingRightSiblingAndParent($node, $before,
            $after, $pos);
    }
}
|
||||
/**
 * Deletes a key-value pair from a leaf node in a B-Tree and writes the
 * node back to disk. If this empties the root, the root file and the
 * node id count file are deleted.
 *
 * @param object &$node is the leaf node containing the key-value pair
 * @param int $pos position in $node->keys of the pair to delete
 */
public function deleteFromLeaf(&$node, $pos)
{
    // drop the pair at $pos and close the gap
    array_splice($node->keys, $pos, 1);
    $node->count -= 1;
    $this->writeNode($node);
    if ($node->count == 0 && $node == $this->root) {
        $this->deleteNodeFile("root");
        $this->deleteCount();
    }
}
|
||||
/**
 * Deletes a key-value pair from a non-leaf node in a B-Tree.
 * If the child before the pair has at least min_degree keys, that child's
 * last pair replaces the deleted one (and is deleted from the child);
 * otherwise, if the child after the pair has at least min_degree keys,
 * its first pair is used in the same way. If neither has enough keys, the
 * pair and the child after it are merged into the child before it, and the
 * deletion continues in the merged child. If the merge empties the root,
 * the merged child becomes the new root.
 *
 * @param object &$node is the non-leaf node containing the key-value pair
 * @param int $pos position in $node of the pair to delete (also the link
 *      position of the child before it)
 */
public function deleteFromNonLeaf(&$node, $pos)
{
    $before = $this->readNode($node->links[$pos]);
    if ($before->count >= $this->min_degree) {
        // NOTE(review): this takes the last key stored in the child
        // itself; if that child is internal its last sub-tree holds
        // larger keys -- confirm this branch is only reached for leaf
        // children (delete() runs reArrange() first).
        $replacement = $before->keys[$before->count - 1];
        $this->delete($before, $replacement[0]);
        $node->keys[$pos] = $replacement;
        $this->writeNode($node);
        return;
    }
    $after = $this->readNode($node->links[$pos + 1]);
    if ($after->count >= $this->min_degree) {
        $replacement = $after->keys[0];
        $this->delete($after, $replacement[0]);
        $node->keys[$pos] = $replacement;
        $this->writeNode($node);
        return;
    }
    // Neither neighbour can spare a key: push the pair down to the end
    // of $before ...
    $doomed_pair = $node->keys[$pos];
    $doomed_key = $doomed_pair[0];
    $before->keys[$before->count] = $doomed_pair;
    $before->count += 1;
    // ... remove it, and the link to $after, from this node ...
    array_splice($node->keys, $pos, 1);
    array_splice($node->links, $pos + 1, 1);
    $node->count -= 1;
    // ... and append everything $after held to $before.
    for ($k = 0; $k < $after->count; $k++) {
        $before->keys[$before->count + $k] = $after->keys[$k];
    }
    if ($after->is_leaf == false) {
        for ($k = 0; $k <= $after->count; $k++) {
            $before->links[$before->count + $k] = $after->links[$k];
        }
    }
    $before->count += $after->count;
    $this->writeNode($before);
    $this->deleteNodeFile($after->id);
    if ($node == $this->root && $node->count == 0) {
        // the root ran out of keys: the merged child takes its place
        $former_id = $before->id;
        $before->id = "root";
        $this->root = $before;
        $this->deleteNodeFile($former_id);
        $this->writeNode($this->root);
    } else {
        $this->writeNode($node);
    }
    $this->delete($before, $doomed_key);
}
|
||||
/**
 * If the key to be deleted is not found in an internal node, finds the root
 * of the sub-tree that might contain the key to be deleted. If that child
 * does not have exactly min_degree - 1 keys, it is returned as is.
 * Otherwise it is first given an extra key from a sibling (through the
 * parent) when a sibling has at least min_degree keys, or else merged
 * with a sibling and the separating parent key.
 *
 * @param object $parent is the parent node
 * @param int $pos is the position in $parent of the link to the root of
 *      the sub-tree
 * @return object the node to which the recursion will descend: the child,
 *      or its left sibling when the child was merged into that sibling
 */
public function getDescendant($parent, $pos)
{
    $child = $this->readNode($parent->links[$pos]);
    if ($child->count != $this->min_degree - 1) {
        return $child;
    }
    $siblings = $this->getSiblings($parent, $pos);
    $has_left = ($siblings[0] !== -1);
    $has_right = ($siblings[1] !== -1);
    if ($has_left) {
        $pred = $this->readNode($siblings[0]);
        if ($pred->count >= $this->min_degree) {
            $this->adjustChildUsingLeftSiblingAndParent($parent, $child,
                $pred, $pos);
            return $child;
        }
        if (!$has_right) {
            // only a left sibling exists: fold the child into it
            $this->mergeChildWithParentKeyAndRightSibling($parent,
                $pred, $child, $pos - 1);
            return $pred;
        }
        $next = $this->readNode($siblings[1]);
        if ($next->count >= $this->min_degree) {
            $this->adjustChildUsingRightSiblingAndParent($parent,
                $child, $next, $pos);
            return $child;
        }
        // neither sibling can lend a key: merge with the smaller one,
        // preferring the left sibling on a tie
        if ($pred->count <= $next->count) {
            $this->mergeChildWithParentKeyAndRightSibling(
                $parent, $pred, $child, $pos - 1);
            return $pred;
        }
        $this->mergeChildWithParentKeyAndRightSibling(
            $parent, $child, $next, $pos);
        return $child;
    }
    // no left sibling: work with the right one
    $next = $this->readNode($siblings[1]);
    if ($next->count >= $this->min_degree) {
        $this->adjustChildUsingRightSiblingAndParent($parent,
            $child, $next, $pos);
        return $child;
    }
    $this->mergeChildWithParentKeyAndRightSibling($parent,
        $child, $next, $pos);
    return $child;
}
|
||||
/**
 * Gives a child node an extra key by moving a key from the parent to the
 * front of the child node, and by moving the last key of the child's left
 * sibling up to the parent node. For non-leaf nodes the sibling's last
 * link moves to the front of the child's links. All three nodes are
 * written to disk.
 *
 * @param object &$parent is the parent node
 * @param object &$child is the child node
 * @param object &$pred is the $child's left sibling node
 * @param int $pos is the position in $parent of the link to $child
 */
public function adjustChildUsingLeftSiblingAndParent(&$parent, &$child,
    &$pred, $pos)
{
    // take the last pair (and last link, if any) off the left sibling
    $borrowed_pair = array_pop($pred->keys);
    $borrowed_link = -1;
    if ($pred->is_leaf == false) {
        $borrowed_link = array_pop($pred->links);
    }
    $pred->count -= 1;
    $this->writeNode($pred);
    // the parent key separating the two nodes goes to the child's front
    $separator = $parent->keys[$pos - 1];
    array_unshift($child->keys, $separator);
    if ($child->is_leaf == false) {
        array_unshift($child->links, $borrowed_link);
    }
    $child->count += 1;
    $this->writeNode($child);
    // and the borrowed pair becomes the new separator
    $parent->keys[$pos - 1] = $borrowed_pair;
    $this->writeNode($parent);
}
|
||||
/**
 * Rotates one key to the left through the parent so that $child gains a
 * key: the first key of the right sibling $next moves up to replace the
 * parent's separating key, and that separating key moves down to become
 * the last key of $child. When $next is an internal node its first link is
 * removed from it, and when $child is an internal node that link is
 * appended as the last link of $child. All three nodes are written back
 * with writeNode().
 *
 * @param object& $parent is the parent node
 * @param object& $child is the child node which receives the extra key
 * @param object& $next is the $child's right sibling node
 * @param int $pos is the link from $parent to $child; the separating key
 *      that is rotated is $parent->keys[$pos]
 */
public function adjustChildUsingRightSiblingAndParent(&$parent, &$child,
    &$next, $pos)
{
    $donated_pair = $next->keys[0];
    $donated_link = -1;
    $remaining = $next->count - 1;
    // close the gap left at index 0 of $next's keys, then drop the
    // now duplicated last entry
    for ($from = 1; $from <= $remaining; $from++) {
        $next->keys[$from - 1] = $next->keys[$from];
    }
    $next->keys = array_slice($next->keys, 0, $remaining);
    if ($next->is_leaf == false) {
        $donated_link = $next->links[0];
        // links run from index 0 to count, one more than the keys
        for ($from = 1; $from <= $next->count; $from++) {
            $next->links[$from - 1] = $next->links[$from];
        }
        $next->links = array_slice($next->links, 0, $next->count);
    }
    $next->count -= 1;
    $this->writeNode($next);
    // the parent's separator is appended to $child ...
    $demoted_pair = $parent->keys[$pos];
    $child->keys[$child->count] = $demoted_pair;
    $child->count += 1;
    if ($child->is_leaf == false) {
        $child->links[$child->count] = $donated_link;
    }
    $this->writeNode($child);
    // ... and the key taken from $next becomes the new separator
    $parent->keys[$pos] = $donated_pair;
    $this->writeNode($parent);
}
|
||||
/**
 * Merges the child node with its right sibling. The separating key in the
 * parent node is appended to $child, followed by every key (and, for an
 * internal sibling, every link) of $next. The node file of $next is then
 * deleted and the separating key and the link to $next are removed from
 * $parent. If that leaves a parent that compares equal to the root with
 * no keys, the merged node is written out as the new root instead.
 *
 * @param object& $parent is the parent node
 * @param object& $child is the child node; it receives the merged contents
 * @param object& $next is the $child's right sibling node
 * @param int $pos is the link from $parent to $child; the separating key
 *      used is $parent->keys[$pos]
 */
public function mergeChildWithParentKeyAndRightSibling(&$parent, &$child,
    &$next, $pos)
{
    // the separating key is placed between the two sets of keys
    $child->keys[$child->count] = $parent->keys[$pos];
    $child->count += 1;
    $base = $child->count;
    for ($offset = 0; $offset < $next->count; $offset++) {
        $child->keys[$base + $offset] = $next->keys[$offset];
    }
    if ($next->is_leaf == false) {
        // an internal node carries count + 1 links, hence the <=
        for ($offset = 0; $offset <= $next->count; $offset++) {
            $child->links[$base + $offset] = $next->links[$offset];
        }
    }
    $child->count = count($child->keys);
    $this->writeNode($child);
    $this->deleteNodeFile($next->id);
    // take key $pos and link $pos + 1 (the link to $next) out of $parent
    $last_key = $parent->count - 1;
    if ($pos == $last_key) {
        array_pop($parent->keys);
        array_pop($parent->links);
    } else {
        for ($to = $pos; $to < $last_key; $to++) {
            $parent->keys[$to] = $parent->keys[$to + 1];
        }
        $parent->keys = array_slice($parent->keys, 0, $last_key);
        for ($to = $pos + 1; $to < $parent->count; $to++) {
            $parent->links[$to] = $parent->links[$to + 1];
        }
        $parent->links = array_slice($parent->links, 0, $parent->count);
    }
    $parent->count -= 1;
    /*
        Loose comparison: PHP treats two objects of the same class as equal
        under != / == when their properties are equal, so identity is not
        required here.
        NOTE(review): presumably this lets a parent that was re-read from
        storage still match the in-memory root -- confirm.
     */
    if ($parent != $this->root || $parent->count != 0) {
        $this->writeNode($parent);
        return;
    }
    // the root ran out of keys: the merged node takes over the root id
    $old_id = $child->id;
    $child->id = "root";
    $this->root = $child;
    $this->deleteNodeFile($old_id);
    $this->writeNode($this->root);
}
|
||||
/**
 * Gets the ids of the left and right siblings of the child reached through
 * a given link of the parent node. A sibling that does not exist is
 * reported as -1: the left one when $pos is 0, the right one when $pos is
 * not a link strictly between 0 and $parent->count.
 *
 * @param object $parent is the parent node
 * @param int $pos is the link for which the siblings are to be found
 * @return array two element array [left sibling id, right sibling id]
 */
public function getSiblings($parent, $pos)
{
    $is_first = ($pos == 0);
    $has_right = $is_first || ($pos > 0 && $pos < $parent->count);
    $left_id = $is_first ? -1 : $parent->links[$pos - 1];
    $right_id = $has_right ? $parent->links[$pos + 1] : -1;
    return [$left_id, $right_id];
}
|
||||
}
|
||||
/**
 * Class for B-Tree nodes. A node is a plain record: it carries an id, a
 * leaf flag, its key-value pairs and, for internal nodes, the links to
 * its child nodes.
 */
class BTNode
{
    /**
     * Id of this B-Tree node. It is -1 for a freshly constructed node; the
     * tree code assigns the string "root" to the node acting as root.
     * @var int|string
     */
    public $id;
    /**
     * True when the node is a leaf node, false when it is an internal node
     * @var boolean
     */
    public $is_leaf;
    /**
     * Number of key-value pairs currently held in $keys
     * @var int
     */
    public $count;
    /**
     * Key-value pairs stored in this node; null until the first is stored
     * @var array
     */
    public $keys;
    /**
     * Links (node ids) to the child nodes of this node; null until the
     * first is stored
     * @var array
     */
    public $links;
    /**
     * Creates and initializes an empty leaf node with id -1
     */
    public function __construct()
    {
        // no content yet
        $this->keys = null;
        $this->links = null;
        $this->count = 0;
        // an unattached leaf
        $this->is_leaf = true;
        $this->id = -1;
    }
}
|
235
src/library/BloomFilterBundle.php
Normal file
235
src/library/BloomFilterBundle.php
Normal file
|
@ -0,0 +1,235 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 *
 * A BloomFilterBundle is a directory of BloomFilterFile's.
 * The filter bundle, like a Bloom filter, also acts as a set,
 * but once the active filter in it fills up a new filter is
 * added to the bundle so that more data can be stored.
 *
 * On disk the bundle consists of the files filter_0.ftr, filter_1.ftr, ...
 * together with a serialized meta.txt describing them.
 *
 * @author Chris Pollett
 * @see BloomFilterFile
 */
class BloomFilterBundle
{
    /**
     * Reference to the filter which will be used to store new data
     * @var object
     */
    public $current_filter;
    /**
     * Total number of filters that this filter bundle currently has
     * @var int
     */
    public $num_filters;
    /**
     * The number of items which have been stored in the current filter
     * @var int
     */
    public $current_filter_count;
    /**
     * The maximum capacity of a filter in this filter bundle
     * @var int
     */
    public $filter_size;
    /**
     * The folder name of this filter bundle
     * @var string
     */
    public $dir_name;
    /**
     * The default maximum size of a filter in a filter bundle
     */
    const default_filter_size = 10000000;
    /**
     * Creates or loads if already exists the directory structure and
     * BloomFilterFiles used by this bundle
     *
     * @param string $dir_name directory where this bundle's data is stored
     * @param int $filter_size the size of an individual filter in this
     *      bundle; once a filter is filled a new one is added to the
     *      directory. Only used when the bundle has no filters yet,
     *      otherwise the size recorded in meta.txt stays in force
     */
    public function __construct($dir_name,
        $filter_size = self::default_filter_size)
    {
        $this->dir_name = $dir_name;
        if (!is_dir($dir_name)) {
            mkdir($dir_name);
        }
        $this->loadMetaData();
        if ($this->num_filters == 0) {
            // brand new bundle: create, count and persist the first filter
            $this->current_filter =
                new BloomFilterFile($dir_name."/filter_0.ftr", $filter_size);
            $this->num_filters++;
            $this->filter_size = $filter_size;
            $this->current_filter->save();
            $this->saveMetaData();
        } else {
            // existing bundle: new data goes into the most recent filter
            $last_filter = $this->num_filters - 1;
            $this->current_filter =
                BloomFilterFile::load($dir_name."/filter_$last_filter.ftr");
        }
    }
    /**
     * Inserts a $value into the BloomFilterBundle
     *
     * This involves inserting into the current filter, if the filter
     * is full, a new filter is added before the value is added
     *
     * @param string $value a item to add to the filter bundle
     */
    public function add($value)
    {
        if ($this->current_filter_count >= $this->filter_size) {
            $this->current_filter->save();
            // drop the full filter before allocating its replacement
            $this->current_filter = null;
            gc_collect_cycles();
            $last_filter = $this->num_filters;
            $this->current_filter =
                new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr",
                    $this->filter_size);
            $this->current_filter_count = 0;
            $this->num_filters++;
            $this->saveMetaData();
        }
        $this->current_filter->add($value);
        $this->current_filter_count++;
    }
    /**
     * Removes from the passed array those elements $elt who either are in
     * the filter bundle or whose $elt[$field_name] is in the bundle.
     *
     * Elements are removed with unset(), so the keys of the surviving
     * elements are left as they were. A value of false is never looked up
     * and so is never a reason to remove an element.
     *
     * @param array& $arr the array to remove elements from
     * @param array $field_names if not null an array of field names of $arr
     *      to use to do filtering
     */
    public function differenceFilter(&$arr, $field_names = null)
    {
        $incremental_time = microtime(true);
        $num_filters = $this->num_filters;
        $count = count($arr);
        for ($i = 0; $i < $num_filters; $i++) {
            if ($i == $num_filters - 1) {
                $tmp_filter = $this->current_filter;
            } else {
                $tmp_filter =
                    BloomFilterFile::load($this->dir_name."/filter_$i.ftr");
            }
            /*
                Walk a snapshot of the keys which are still present and
                read values by value. Binding a reference to $arr[$j]
                would create the entry when it is missing, which
                re-inserted (as null) every element removed by an earlier
                filter and added null fields to rows lacking a field.
             */
            foreach (array_keys($arr) as $j) {
                if ($field_names === null) {
                    $tmp = $arr[$j];
                    if ($tmp !== false && $tmp_filter->contains($tmp)) {
                        /*
                            We deliberately don't try to look up anything
                            that has the hash field set to false. This is
                            our cue to skip an element such as a link
                            document which we know will almost always be
                            unique and so be unnecessary to de-duplicate
                         */
                        unset($arr[$j]);
                    }
                } else { //now do the same strategy for the array of fields case
                    foreach ($field_names as $field_name) {
                        // a missing field is looked up as null
                        $tmp = isset($arr[$j][$field_name]) ?
                            $arr[$j][$field_name] : null;
                        if ($tmp !== false && $tmp_filter->contains($tmp)) {
                            unset($arr[$j]);
                            break;
                        }
                    }
                }
                if (changeInMicrotime($incremental_time) > 30 ) {
                    crawlLog("..Processing item $j of $count from filter ".
                        "number $i of $num_filters.");
                    $incremental_time = microtime(true);
                }
            }
        }
    }
    /**
     * Loads from the filter bundles' meta.txt the meta data associated with
     * this filter bundle and stores this data into field variables. If
     * meta.txt is missing or does not hold the expected array, the fields
     * are set to the values for an empty bundle.
     */
    public function loadMetaData()
    {
        $meta = false;
        $meta_file = $this->dir_name.'/meta.txt';
        if (file_exists($meta_file)) {
            /*
                NOTE(review): unserialize() is only safe on data this class
                wrote itself (see saveMetaData); the bundle folder must not
                be writable by untrusted parties.
             */
            $meta = unserialize(file_get_contents($meta_file));
        }
        if (is_array($meta) && isset($meta['NUM_FILTERS'],
            $meta['CURRENT_FILTER_COUNT'], $meta['FILTER_SIZE'])) {
            $this->num_filters = $meta['NUM_FILTERS'];
            $this->current_filter_count = $meta['CURRENT_FILTER_COUNT'];
            $this->filter_size = $meta['FILTER_SIZE'];
        } else {
            $this->num_filters = 0;
            $this->current_filter_count = 0;
            $this->filter_size = self::default_filter_size;
        }
    }
    /**
     * Saves the meta data (number of filter, number of items stored, and size)
     * of the bundle
     */
    public function saveMetaData()
    {
        $meta = [];
        $meta['NUM_FILTERS'] = $this->num_filters;
        $meta['CURRENT_FILTER_COUNT'] = $this->current_filter_count;
        $meta['FILTER_SIZE'] = $this->filter_size;
        file_put_contents($this->dir_name.'/meta.txt', serialize($meta));
    }
    /**
     * Empties the contents of the bloom filter bundle and resets
     * it to start storing new data.
     */
    public function reset()
    {
        for ($i = 0; $i < $this->num_filters; $i++) {
            // best effort: a filter file that is already gone is fine
            @unlink($this->dir_name."/filter_$i.ftr");
        }
        $this->num_filters = 0;
        $this->current_filter_count = 0;
        $this->current_filter =
            new BloomFilterFile($this->dir_name."/filter_0.ftr",
                $this->filter_size);
        $this->num_filters++;
        $this->current_filter->save();
        $this->saveMetaData();
    }
    /**
     * Used to save to disk all the file data associated with this bundle
     */
    public function forceSave()
    {
        $this->saveMetaData();
        $this->current_filter->save();
    }
}
|
167
src/library/BloomFilterFile.php
Normal file
167
src/library/BloomFilterFile.php
Normal file
|
@ -0,0 +1,167 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
|
||||
* For packInt/unpackInt
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
|
||||
/**
 * Code used to manage a bloom filter in-memory and in file.
 * A Bloom filter is used to store a set of objects.
 * It can support inserts into the set and it can also be
 * used to check membership in the set.
 *
 * @author Chris Pollett
 */
class BloomFilterFile extends PersistentStructure
{
    /**
     * Number of bit positions in the Bloom filter used to say an item is
     * in the filter
     * @var int
     */
    public $num_keys;
    /**
     * Size in bits of the packed string array used to store the filter's
     * contents
     * @var int
     */
    public $filter_size;
    /**
     * Packed string used to store the Bloom filters
     * @var string
     */
    public $filter;
    /**
     * Initializes the fields of the BloomFilter and its base
     * PersistentStructure.
     *
     * @param string $fname name of the file to store the BloomFilter data in
     * @param int $num_values the maximum number of values that will be stored
     *      in the BloomFilter. Filter will be sized so the odds of a false
     *      positive are roughly one over this value
     * @param int $save_frequency how often to store the BloomFilter to disk
     */
    public function __construct($fname, $num_values,
        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
    {
        $log2 = log(2);
        // ceil(log_2($num_values)) bit positions per item
        $this->num_keys = ceil(log($num_values)/$log2);
        $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2 );
        /*
            "x" followed by a count packs that many NUL bytes, so this
            allocates the filter with every bit cleared.
            1/8 = .125 = bytes per bit, want to make things floats
         */
        $this->filter = pack("x". ceil(0.125 * $this->filter_size));
        parent::__construct($fname, $save_frequency);
    }
    /**
     * Inserts the provided item into the Bloomfilter by setting each of the
     * bit positions the item hashes to, then calls checkSave()
     *
     * @param string $value item to add to filter
     */
    public function add($value)
    {
        $num_keys = $this->num_keys;
        $pos_array = $this->getHashBitPositionArray($value, $num_keys);
        for ($i = 0; $i < $num_keys; $i++) {
            $this->setBit($pos_array[$i]);
        }
        $this->checkSave();
    }
    /**
     * Checks if the BloomFilter contains the provided $value
     *
     * @param string $value item to check if is in the BloomFilter
     * @return bool true when every bit position $value hashes to is set;
     *      false as soon as one of them is found to be clear
     */
    public function contains($value)
    {
        $num_keys = $this->num_keys;
        $pos_array = $this->getHashBitPositionArray($value, $num_keys);
        for ($i = 0; $i < $num_keys; $i++) {
            if (!$this->getBit($pos_array[$i])) {
                return false;
            }
        }
        return true;
    }
    /**
     * Hashes $value to $num_keys bit positions in the BloomFilter
     *
     * @param string $value value to map to bit positions in the filter
     * @param int $num_keys number of bit positions in the Bloom filter
     *      used to say an item is in the filter
     * @return array the $num_keys bit positions mapped to
     */
    public function getHashBitPositionArray($value, $num_keys)
    {
        /*
            Each raw md5 gives 16 bytes, that is four 32 bit seeds, so
            floor($num_keys/4) + 1 chained md5 rounds give at least
            $num_keys seeds.
         */
        $offset = ($num_keys >> 2) + 1;
        $rand_string = "";
        for ($i = 0 ; $i < $offset; $i++) {
            $value = md5($value, true);
            $rand_string .= $value;
        }
        $seed = array_values(unpack("N*", $rand_string));
        $pos_array = [];
        /*
            Positions are the remainder modulo half the filter size, plus
            (half the filter size - 1).
            NOTE(review): presumably the added term keeps a position
            non-negative should a seed come back negative -- confirm.
            Changing this mapping would make previously saved filters
            unreadable.
         */
        $size = $this->filter_size >> 1;
        $less_one = $size - 1;
        for ($i = 0; $i < $num_keys; $i++) {
            $pos_array[$i] = ($seed[$i] % $size) + $less_one;
        }
        return $pos_array;
    }
    /**
     * Sets to true the ith bit position in the filter.
     *
     * @param int $i the position to set to true
     */
    public function setBit($i)
    {
        // eight bits per character of the packed string
        $byte = ($i >> 3);
        $bit_in_byte = $i - ($byte << 3);
        $tmp = $this->filter[$byte];
        // | on two strings ORs their characters
        $this->filter[$byte] = $tmp | chr(1 << $bit_in_byte);
    }
    /**
     * Looks up the value of the ith bit position in the filter
     *
     * @param int $i the position to look up
     * @return bool the value of the looked up position
     */
    public function getBit($i)
    {
        $byte = $i >> 3;
        $bit_in_byte = $i - ($byte << 3);
        return ($this->filter[$byte] & chr(1 << $bit_in_byte)) != chr(0);
    }
}
|
80
src/library/BrowserRunner.php
Normal file
80
src/library/BrowserRunner.php
Normal file
|
@ -0,0 +1,80 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
* LICENSE:
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Eswara Rajesh Pinapala epinapala@live.com
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
 * Used to execute browser-based Javascript and browser page rendering from PHP.
 *
 * @author Eswara Rajesh Pinapala
 */
class BrowserRunner
{
    /**
     * Tests if there is a headless browser (typically Phantom JS) available
     * before constructing this kind of object, by running it with the
     * argument -v. If that produces no output, it throws an exception.
     */
    public function __construct()
    {
        if (!$this->execute("-v")) {
            throw new \Exception("BrowserRunner currently requires PhantomJS ".
                "package to run");
        }
    }
    /**
     * Runs a Javascript in the current headless browser instance and
     * return the results as either a JSON or PHP object.
     *
     * The command line is made of C\PHANTOM_JS followed by every argument
     * this method was called with, joined by spaces. This includes the
     * value given for $decode_json when one is supplied.
     *
     * @param string $script Javascript to run in browser
     * @param string $decode_json whether to leave result as is or to convert
     *      from JSON to a PHP object
     * @return mixed false if the command returned null or if output that
     *      starts with { could not be decoded; the decoded array if
     *      $decode_json is set and the output starts with {; otherwise
     *      the output of the command as is
     */
    public function execute($script, $decode_json = false)
    {
        $arguments = func_get_args();
        $command = C\PHANTOM_JS." " . implode(' ', $arguments);
        $output = shell_exec(escapeshellcmd($command));
        if ($output === null) {
            return false;
        }
        // output which was not asked to be decoded, or which is not a
        // JSON object, is handed back untouched
        if (!$decode_json || substr($output, 0, 1) !== '{') {
            return $output;
        }
        $decoded = json_decode($output, true);
        return ($decoded === null) ? false : $decoded;
    }
}
|
334
src/library/Bzip2BlockIterator.php
Normal file
334
src/library/Bzip2BlockIterator.php
Normal file
|
@ -0,0 +1,334 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Shawn Tice, (docs added by Chris Pollett chris@pollett.org)
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
|
||||
* This class is used to allow one to iterate through a Bzip2 file.
|
||||
* The main advantage of using this class over the built-in bzip is that
|
||||
* it can "remember" where it left off between serializations. So can
|
||||
* continue where left off between web invocations. This is used in
|
||||
* doing archive crawls of wiki dumps to let the name server pick up where
|
||||
* it left off.
|
||||
*
|
||||
* @author Shawn Tice, (some docs added by Chris Pollett chris@pollett.org)
|
||||
*/
|
||||
class BZip2BlockIterator
|
||||
{
|
||||
/**
|
||||
* File handle for bz2 file
|
||||
* @var resource
|
||||
*/
|
||||
public $fd = null;
|
||||
/**
|
||||
* Byte offset into bz2 file
|
||||
* @var int
|
||||
*/
|
||||
public $file_offset = 0;
|
||||
/**
|
||||
* Since block sizes are not constant used to store sufficiently many
|
||||
* bytes so can properly extract next blocks
|
||||
* @var string
|
||||
*/
|
||||
public $buffer = '';
|
||||
/**
|
||||
* Used to build and store a bz2 block from the file stream
|
||||
* @var string
|
||||
*/
|
||||
public $block = '';
|
||||
/**
|
||||
* Stores the left over bits of a bz2 block
|
||||
* @var int
|
||||
*/
|
||||
public $bits = 0;
|
||||
/**
|
||||
* Store how many left-over bits there are
|
||||
* @var int
|
||||
*/
|
||||
public $num_extra_bits = 0;
|
||||
/**
|
||||
* Lookup table for the number of bits by which the magic
|
||||
* number for the next block has been shifted right. Second
|
||||
* components of sub-arrays say whether block header or endmark
|
||||
* @var array
|
||||
*/
|
||||
public static $header_info = [
|
||||
"\x41" => [0, true], "\xa0" => [1, true],
|
||||
"\x50" => [2, true], "\x28" => [3, true],
|
||||
"\x14" => [4, true], "\x8a" => [5, true],
|
||||
"\xc5" => [6, true], "\x62" => [7, true],
|
||||
|
||||
"\x72" => [0, false], "\xb9" => [1, false],
|
||||
"\xdc" => [2, false], "\xee" => [3, false],
|
||||
"\x77" => [4, false], "\xbb" => [5, false],
|
||||
"\x5d" => [6, false], "\x2e" => [7, false]
|
||||
];
|
||||
/** String to tell if file is a bz2 file*/
|
||||
const MAGIC = 'BZh';
|
||||
/** String at the start of each bz2 block */
|
||||
const BLOCK_HEADER = "\x31\x41\x59\x26\x53\x59";
|
||||
/** String at the end of each bz2 block*/
|
||||
const BLOCK_ENDMARK = "\x17\x72\x45\x38\x50\x90";
|
||||
/**
|
||||
* Blocks are NOT byte-aligned, so the block header (and endmark) may show
|
||||
* up shifted right by 0-7 bits in various places throughout the file. This
|
||||
* regular expression matches any of the possible shifts for both the block
|
||||
* header and the block endmark.
|
||||
*/
|
||||
const BLOCK_LEADER_RE = '
|
||||
/
|
||||
\x41\x59\x26\x53\x59 | \xa0\xac\x93\x29\xac | \x50\x56\x49\x94\xd6
|
||||
|\x28\x2b\x24\xca\x6b | \x14\x15\x92\x65\x35 | \x8a\x0a\xc9\x32\x9a
|
||||
|\xc5\x05\x64\x99\x4d | \x62\x82\xb2\x4c\xa6
|
||||
|
||||
|\x72\x45\x38\x50\x90 | \xb9\x22\x9c\x28\x48 | \xdc\x91\x4e\x14\x24
|
||||
|\xee\x48\xa7\x0a\x12 | \x77\x24\x53\x85\x09 | \xbb\x92\x29\xc2\x84
|
||||
|\x5d\xc9\x14\xe1\x42 | \x2e\xe4\x8a\x70\xa1
|
||||
/x';
|
||||
/**
|
||||
* How many bytes to read into buffer from bz2 stream in one go
|
||||
*/
|
||||
const BLOCK_SIZE = 8192;
|
||||
/**
 * Creates a new iterator of a bz2 file by opening the file, doing a
 * sanity check and then setting up the initial file_offset to
 * where the data starts
 *
 * The first 4 bytes read are kept as the stream header and the next 6
 * bytes must be the block header of the first block, so reading resumes
 * at byte 10.
 *
 * @param string $path file path of bz2 file
 * @throws \Exception if the file cannot be opened, does not begin with
 *      the bz2 magic string, or is not followed by a block header
 */
public function __construct($path)
{
    /*
        NOTE(review): path and header are not among the properties declared
        in the part of the class visible here -- confirm they are declared
        elsewhere, PHP 8.2 deprecates creating properties on the fly.
     */
    $this->path = $path;
    $this->fd = fopen($this->path, 'rb');
    if ($this->fd === false) {
        /*
            Without this check the failure would surface as a misleading
            bad magic number exception or, on PHP 8, as a TypeError from
            fread() which code catching \Exception does not see.
         */
        throw new \Exception("Could not open bz2 file: $path");
    }
    $this->header = fread($this->fd, 4);
    if (substr($this->header, 0, 3) != self::MAGIC) {
        throw new \Exception('Bad bz2 magic number. Not a bz2 file?');
    }
    $this->block = fread($this->fd, 6);
    if ($this->block != self::BLOCK_HEADER) {
        throw new \Exception('Bad bz2 block header');
    }
    $this->file_offset = 10;
}
|
||||
/**
 * Called by unserialize prior to execution. Re-opens the bz2 file and
 * moves the read position back to the saved file_offset.
 */
public function __wakeup()
{
    $handle = fopen($this->path, 'rb');
    $this->fd = $handle;
    fseek($handle, $this->file_offset);
}
|
||||
/**
 * Checks whether the file handle of the current Bzip2 file has reached
 * an end of file
 * @return bool eof or not
 */
public function eof()
{
    $at_end = feof($this->fd);
    return $at_end;
}
|
||||
/**
 * Used to close the file associated with this iterator
 * @return bool whether the file close was successful
 */
public function close()
{
    $was_closed = fclose($this->fd);
    return $was_closed;
}
|
||||
/**
|
||||
* Extracts the next bz2 block from the bzip2 file this iterator works
|
||||
* on
|
||||
* @param bool $raw if false then decompress the recovered block
|
||||
*/
|
||||
public function nextBlock($raw = false)
{
    /*
        Reads the file a chunk at a time, accumulating everything that is
        not a block header into $this->block (bit-shifted so that it is
        byte aligned), until the header of the following block is found.
        At that point the accumulated block is finished off with an
        end-of-stream trailer and handed back.
     */
    $recovered_block = null;
    while(!feof($this->fd)) {
        $next_chunk = fread($this->fd, self::BLOCK_SIZE);
        $this->file_offset += strlen($next_chunk);
        $this->buffer .= $next_chunk;
        $match = preg_match( self::BLOCK_LEADER_RE, $this->buffer,
            $matches, PREG_OFFSET_CAPTURE);
        if ($match) {
            /*
                $pos is the position of the SECOND byte of the magic number
                (plus some part of the first byte for a non-zero new_shift).
             */
            $pos = $matches[0][1];
            /*
                The new_shift is the number of bits by which the magic
                number for the next block has been shifted right.
             */
            list($new_shift, $is_start) =
                self::$header_info[$this->buffer[$pos]];
            /*
                The new number of extra bits is what's left in a byte after
                the new shift. For example, if we have 10|001011 as the byte
                that begins the next block's header, where the vertical bar
                represents the beginning of the header bits, the new shift
                is 2, and after we byte-align the new header to the left
                there will always be 6 extra bits waiting for two bits to
                form a byte to be added to the next block.
             */
            $new_num_extra_bits = $new_shift == 0 ? 0 : 8 - $new_shift;
            /*
                Offset (relative to $pos) one past the last buffer byte that
                belongs to the candidate header: an unshifted header
                occupies $pos - 1 .. $pos + 4, a shifted one needs the six
                bytes $pos .. $pos + 5.
             */
            $header_end = ($new_shift == 0) ? 5 : 6;
            if (strlen($this->buffer) < $pos + $header_end) {
                /*
                    The candidate header straddles the end of what has been
                    read so far. Leave the buffer untouched and read more
                    before judging it (the loop ends by itself at EOF).
                 */
                continue;
            }
            if ($new_shift == 0) {
                $tail_bits = $new_bits = 0;
                $new_header = substr($this->buffer, $pos - 1, 6);
                $new_block = $new_header;
            } else {
                $byte = ord($this->buffer[$pos-1]);
                $tail_bits = $byte & (((0x1 << $new_shift) - 1) <<
                    (8 - $new_shift));
                $new_bits = ($byte << $new_shift) & 0xff;
                $new_block = '';
                $new_header = substr($this->buffer, $pos, 6);
                self::packLeft($new_block, $new_bits, $new_header,
                    $new_num_extra_bits);
            }
            // Make sure all six header bytes match.
            if ($is_start && $new_block != self::BLOCK_HEADER ||
                !$is_start && $new_block != self::BLOCK_ENDMARK) {
                $unmatched = substr($this->buffer, 0, $pos + 6);
                $keep = substr($this->buffer, $pos + 6);
                self::packLeft($this->block, $this->bits, $unmatched,
                    $this->num_extra_bits);
                /*
                    The unmatched bytes now live in $this->block, so they
                    must leave the buffer; otherwise they would be packed
                    again and the same false match would be found on every
                    later pass.
                 */
                $this->buffer = $keep;
                continue;
            }
            /*
                Copy and shift the last chunk of bytes from the previous
                block before adding the block trailer.
             */
            $block_tail = substr($this->buffer, 0, $pos - 1);
            $this->packLeft($this->block, $this->bits, $block_tail,
                $this->num_extra_bits);
            /*
                We need to combine the non-header tail bits from the most
                significant end of the last byte before the next block's
                header with whatever extra bits are left over from shifting
                the body of the previous block.
             */
            $bits_left = 8 - $this->num_extra_bits;
            if ($new_shift >= $bits_left) {
                $this->bits |= ($tail_bits >> $this->num_extra_bits);
                $this->block .= chr($this->bits);
                $this->bits = ($tail_bits << $bits_left) & 0xff;
                $this->num_extra_bits = $new_shift - $bits_left;
            } else {
                $this->bits |= ($tail_bits >> $this->num_extra_bits);
                $this->num_extra_bits = $this->num_extra_bits +
                    $new_shift;
            }
            /*
                The last block is marked by a different header (sqrt(pi)),
                and a CRC for the entire "file", which is just the CRC for
                the first block, since there's only one block.
             */
            $trailer = "\x17\x72\x45\x38\x50\x90".
                substr($this->block, 6, 4);
            $this->packLeft($this->block, $this->bits, $trailer,
                $this->num_extra_bits);
            if ($this->num_extra_bits != 0) {
                $this->block .= chr($this->bits);
            }
            $recovered_block = $this->header.$this->block;
            $this->block = $new_block;
            /*
                Keep everything after the end of the header for the next
                block in the buffer.
             */
            $this->buffer = substr($this->buffer, $pos + $header_end);
            $this->bits = $new_bits;
            $this->num_extra_bits = $new_num_extra_bits;
            break;
        } else {
            /*
                No match, but we may have just missed a header by a byte, so
                we need to keep the last six bytes in the buffer so that we
                have a chance to get the full header on the next round.
             */
            $unmatched = substr($this->buffer, 0, -6);
            $this->packLeft($this->block, $this->bits, $unmatched,
                $this->num_extra_bits);
            $this->buffer = substr($this->buffer, -6);
        }
    }
    /*
        NOTE(review): if the loop ended without finding a header,
        $recovered_block is still null here and is returned (or handed to
        bzdecompress) as is.
     */
    if (!$raw) {
        return bzdecompress($recovered_block);
    } else {
        return $recovered_block;
    }
}
|
||||
/**
 * Appends $bytes to the bzip2 block being rebuilt, shifting every byte
 * right by $num_extra_bits so that it lines up behind the bits that are
 * still pending in $bits.
 *
 * @param string& $block the block to add to; completed bytes are appended
 * @param int& $bits pending bits not yet written to $block; on return it
 *      holds the bits left over from the last byte of $bytes (it is left
 *      untouched when $num_extra_bits is 0 or $bytes is empty)
 * @param string $bytes what to add to the bzip block
 * @param int $num_extra_bits how many extra (pending) bits there are
 */
public function packLeft(&$block, &$bits, $bytes, $num_extra_bits)
{
    if ($num_extra_bits != 0) {
        // how far the low bits of a byte move up to become the new carry
        $carry_shift = 8 - $num_extra_bits;
        $total = strlen($bytes);
        $index = 0;
        while ($index < $total) {
            $current = ord($bytes[$index]);
            // pending bits on top, high bits of the current byte below
            $block .= chr($bits | ($current >> $num_extra_bits));
            $bits = ($current << $carry_shift) & 0xff;
            $index++;
        }
        return;
    }
    // nothing pending, so the bytes are already aligned
    $block .= $bytes;
}
|
||||
}
|
||||
if (!function_exists("main") && php_sapi_name() == 'cli') {
    /**
     * Command-line shell for testing the class.
     *
     * Reads the bzip2 file named by the first command-line argument and
     * writes each recovered raw block to its own file named
     * <prefix><5-digit counter>.bz2 in the current directory. The prefix
     * is the optional second argument and defaults to 'rec'.
     */
    function main()
    {
        global $argv;
        if (!isset($argv[1])) {
            echo "Usage: php {$argv[0]} bz2_file [output_prefix]\n";
            return;
        }
        $path = $argv[1];
        $prefix = isset($argv[2]) ? $argv[2] : 'rec';
        $itr = new BZip2BlockIterator($path);
        $i = 1;
        // the iterator's method is nextBlock(); with $raw = true it
        // returns null once no further block can be recovered
        while(($block = $itr->nextBlock(true)) !== null) {
            $rec_name = sprintf("%s%05d.bz2", $prefix, $i);
            file_put_contents($rec_name, $block);
            echo "Recovered block {$i}\n";
            $i++;
        }
    }
    // Only run main if this script is called directly from the command line.
    if (isset($argv[0]) && realpath($argv[0]) == __FILE__) {
        main();
    }
}
|
238
src/library/CrawlConstants.php
Normal file
238
src/library/CrawlConstants.php
Normal file
|
@ -0,0 +1,238 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Constants and enum-like values shared by the components that take part
 * in the crawling process. Classes pick them up by implementing this
 * interface.
 *
 * @author Chris Pollett
 */
interface CrawlConstants
{
    /**
     * Used to say what kind of queue_server this is
     */
    const BOTH = 'IndexerAndScheduler',
        INDEXER = 'Indexer',
        SCHEDULER = 'Scheduler';
    /**
     * Base names and file names
     */
    const queue_base_name = 'QueueBundle',
        archive_base_name = 'Archive',
        name_archive_iterator = 'NameArchiveIterator',
        fetch_archive_iterator = 'FetchArchiveIterator',
        save_point = 'SavePoint',
        schedule_data_base_name = 'ScheduleData',
        schedule_name = 'FetchSchedule',
        robot_data_base_name = 'RobotData',
        etag_expires_data_base_name = 'EtagExpiresData',
        index_data_base_name = 'IndexData',
        network_base_name = 'Network',
        network_crawllist_base_name = 'NetworkCrawlList',
        statistics_base_name = 'Statistics',
        index_closed_name = 'IndexClosed',
        fetch_batch_name = 'FetchBatch',
        fetch_crawl_info = 'FetchInfo',
        fetch_closed_name = 'FetchClosed',
        data_base_name = 'At',
        schedule_start_name = 'StartCrawlSchedule.txt',
        robot_table_name = 'robot_table.txt',
        mirror_table_name = 'mirror_table.txt';
    /** Used in priority queue*/
    const MAX = 1,
        MIN = -1;
    /** States of daemon processes*/
    const STOP_STATE = -1,
        CONTINUE_STATE = 1,
        NO_DATA_STATE = 2,
        WAITING_START_MESSAGE_STATE = 3,
        REDO_STATE = 4;
    /**
     * Short string codes, values 'a' through 'z' ('p' is not assigned)
     */
    const STATUS = 'a',
        CRAWL_TIME = 'b',
        HTTP_CODE = 'c',
        TIMESTAMP = 'd',
        TYPE = 'e',
        ENCODING = 'f',
        SEEN_URLS = 'g',
        MACHINE = 'h',
        INVERTED_INDEX = 'i',
        SAVED_CRAWL_TIMES = 'j',
        SCHEDULE_TIME = 'k',
        URL = 'l',
        WEIGHT = 'm',
        ROBOT_PATHS = 'n',
        HASH = 'o',
        PAGE = 'q',
        DOC_INFO = 'r',
        TITLE = 's',
        DESCRIPTION = 't',
        THUMB = 'u',
        CRAWL_DELAY = 'v',
        LINKS = 'w',
        ROBOT_TXT = 'x',
        TO_CRAWL = 'y',
        INDEX = 'z';
    /**
     * Short string codes, values 'A' through 'Z'
     */
    const AVERAGE_TITLE_LENGTH = 'A',
        AVERAGE_DESCRIPTION_LENGTH = 'B',
        AVERAGE_TOTAL_LINK_TEXT_LENGTH = 'C',
        TITLE_LENGTH = 'D',
        DESCRIPTION_LENGTH = 'E',
        LINK_LENGTH = 'F',
        TITLE_WORDS = 'G',
        DESCRIPTION_WORDS = 'H',
        LINK_WORDS = 'I',
        TITLE_WORD_SCORE = 'J',
        DESCRIPTION_WORD_SCORE = 'K',
        LINK_WORD_SCORE = 'L',
        DOC_DEPTH = 'M',
        DOC_RANK = 'N',
        URL_WEIGHT = 'O',
        INLINKS = 'P',
        NEW_CRAWL = 'Q',
        OFFSET = 'R',
        PATHS = 'S',
        HASH_URL = 'T',
        SUMMARY_OFFSET = 'U',
        DUMMY = 'V',
        SITES = 'W',
        SCORE = 'X',
        CRAWL_ORDER = 'Y',
        RESTRICT_SITES_BY_URL = 'Z';
    /**
     * Two letter codes, values 'aa' through 'az'
     * (HASH_URL_COUNT is not used)
     */
    const ALLOWED_SITES = 'aa',
        DISALLOWED_SITES = 'ab',
        BREADTH_FIRST = 'ac',
        PAGE_IMPORTANCE = 'ad',
        MACHINE_URI = 'ae',
        SITE_INFO = 'af',
        FILETYPE = 'ag',
        SUMMARY = 'ah',
        URL_INFO = 'ai',
        HASH_SEEN_URLS = 'aj',
        RECENT_URLS = 'ak',
        MEMORY_USAGE = 'al',
        DOC_ID = 'am',
        RELEVANCE = 'an',
        PAGE_RULES = 'ao',
        CACHE_PAGE_PARTITION = 'ap',
        GENERATION = 'aq',
        HASH_SUM_SCORE = 'ar',
        HASH_URL_COUNT = 'as',
        IS_DOC = 'at',
        BOOST = 'av',
        IP_ADDRESSES = 'au',
        JUST_METAS = 'aw',
        WEB_CRAWL = 'ax',
        ARCHIVE_CRAWL = 'ay',
        CRAWL_TYPE = 'az';
    /**
     * Two letter codes, values 'ba' through 'bz'
     */
    const CRAWL_INDEX = 'ba',
        HEADER = 'bb',
        SERVER = 'bc',
        SERVER_VERSION = 'bd',
        OPERATING_SYSTEM = 'be',
        MODIFIED = 'bf',
        LANG = 'bg',
        ROBOT_INSTANCE = 'bh',
        DOC_LEN = 'bi',
        SUBDOCS = 'bj',
        SUBDOCTYPE = 'bk',
        INDEXING_PLUGINS = 'bl',
        DOMAIN_WEIGHTS = 'bm',
        POSITION_LIST = 'bn',
        PROXIMITY = 'bo',
        LOCATION = 'bp',
        INDEXED_FILE_TYPES = 'bq',
        PAGE_RANGE_REQUEST = 'br',
        PAGE_RECRAWL_FREQUENCY = 'bs',
        DATA = 'bt',
        QUEUE_SERVERS = 'bu',
        CURRENT_SERVER = 'bv',
        SIZE = 'bw',
        TOTAL_TIME = 'bx',
        DNS_TIME = 'by',
        AGENT_LIST = 'bz';
    /**
     * Two letter codes, values 'ca' through 'cz'
     */
    const ROBOT_METAS = 'ca',
        ARC_DIR = 'cb',
        ARC_TYPE = 'cc',
        ARC_DATA = 'cd',
        KEY = 'ce',
        MACHINE_ID = 'cf',
        VIDEO_SOURCES = 'cg',
        IS_FEED = 'ch',
        SOURCE_NAME = 'ci',
        LINK_SEEN_URLS = 'cj',
        POST_MAX_SIZE = 'ck',
        LOGGING = 'cl',
        META_WORDS = 'cm',
        CACHE_PAGES = 'cn',
        WARC_ID = 'co',
        START_PARTITION = 'cp',
        INI = 'cq',
        UI_FLAGS = 'cr',
        KEYWORD_LINKS = 'cs',
        END_ITERATOR = 'ct',
        ACTIVE_CLASSIFIERS = 'cu',
        ACTIVE_CLASSIFIERS_DATA = 'cv',
        MAX_DESCRIPTION_LEN = 'cw',
        CACHE_PAGE_VALIDATORS = 'cx',
        CACHE_PAGE_VALIDATION_DATA = 'cy',
        NUM_PARTITIONS = 'cz';
    /**
     * Two letter codes, values 'da' through 'dz'. No constant carries the
     * value 'dj'; the integer NEEDS_OFFSET_FLAG is declared at that point
     * in the sequence.
     */
    const PARTITION_NUM = 'da',
        ACTIVE_RANKERS = 'db',
        USER_RANKS = 'dc',
        INDEXING_PLUGINS_DATA = 'dd',
        REPOSITORY_TYPE = 'de',
        FILE_NAME = 'df',
        SHA_HASH = 'dg',
        TOR_PROXY = 'dh',
        PROXY_SERVERS = 'di',
        NEEDS_OFFSET_FLAG = 0x7FFFFFFF,
        BASIC_SUMMARIZER = 'dk',
        CENTROID_SUMMARIZER = 'dl',
        SUMMARIZER_OPTION = 'dm',
        WORD_CLOUD = 'dn',
        THESAURUS_SCORE = 'do',
        IS_GOPHER_URL = 'dp',
        MINIMUM_FETCH_LOOP_TIME = 'dq',
        IMAGE_LINK = 'dr',
        GRAPH_BASED_SUMMARIZER = 'ds',
        CENTROID_WEIGHTED_SUMMARIZER = 'dt',
        SCRAPER_LABEL = 'du',
        SCRAPERS = 'dv',
        IS_NEWS = 'dw',
        QUESTION_ANSWERS = 'dx',
        CONTENT_SIZE = 'dy',
        NO_RANGE = 'dz';
}
|
371
src/library/CrawlDaemon.php
Normal file
371
src/library/CrawlDaemon.php
Normal file
|
@ -0,0 +1,371 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
|
||||
/**
|
||||
* Load the crawlLog function
|
||||
*/
|
||||
require_once C\BASE_DIR."/library/Utility.php";
|
||||
/**
 * Used to run scripts as a daemon on *nix systems
 *
 * A daemon is identified by a name (and optionally a subname). Its
 * liveness is tracked through a lock file, whose contents are a unix
 * timestamp, kept under CRAWL_DIR/schedules.
 *
 * @author Chris Pollett
 */
class CrawlDaemon implements CrawlConstants
{
    /**
     * Name prefix to be used on files associated with this daemon
     * (such as the lock file and the messages file)
     * @var string
     * @static
     */
    public static $name;
    /**
     * Subname of the name prefix used on files associated with this daemon
     * For example, the name might be fetcher, the subname might 2 to indicate
     * which fetcher daemon instance.
     *
     * @var string
     * @static
     */
    public static $subname;
    /**
     * Used by processHandler to decide whether run as daemon or not
     * @var string
     * @static
     */
    public static $mode;
    /**
     * Tick callback function used to update the timestamp in this processes
     * lock. If lock_file does not exist or more than PROCESS_TIMEOUT
     * time has elapsed since the last processHandler call it stops the process
     *
     * @param bool $continue if true only stop if lock file not present,
     *      ignore PROCESS_TIMEOUT time being exceeded.
     * @return bool always true (when it returns at all; on stop the script
     *      exits). Outside of daemon mode it returns without doing anything.
     */
    public static function processHandler($continue = false)
    {
        // time of the previous call, remembered across calls
        static $time = 0;
        if (self::$mode != 'daemon') {
            return true;
        }
        $lock_file = CrawlDaemon::getLockFileName(self::$name, self::$subname);
        $now = time();
        if ($time == 0 ) {
            $time = $now;
        }
        $lock_exist = file_exists($lock_file);
        if (!$lock_exist || ($now - $time) > C\PROCESS_TIMEOUT) {
            $name_string = CrawlDaemon::getNameString(self::$name,
                self::$subname);
            if (($now - $time) > C\PROCESS_TIMEOUT) {
                crawlLog($name_string.": ".($now - $time) .
                    " seconds has elapsed since processHandler last called.",
                    null, true);
                crawlLog("Timeout exceeded...", null, true);
            }
            if (!$lock_exist || !$continue) {
                crawlLog("Stopping $name_string ...", null, true);
                exit();
            }
        }
        $time = $now;
        file_put_contents($lock_file, $now);
        return true;
    }
    /**
     * Used to send a message the given daemon or run the program in the
     * foreground.
     *
     * @param array $argv an array of command line arguments. The argument
     *      start will check if the process control functions exists if these
     *      do they will fork and detach a child process to act as a daemon.
     *      a lock file will be created to prevent additional daemons from
     *      running. If the message is stop then a message file is written to
     *      tell the daemon to stop. If the argument is terminal then the
     *      program won't be run as a daemon.
     * @param string $name the prefix to use for lock and message files
     * @param int $exit_type whether this function should exit > 0 or return (1)
     *      by default a lock file is only written if exit (this allows
     *      both queue server processes (Indexer and Scheduler) to use the
     *      same lock file. If exit is >=3 or <= -3 then doesn't check lock
     *      to see if already running before starting
     */
    public static function init($argv, $name, $exit_type = 1)
    {
        self::$name = $name;
        if (isset($argv[2]) && $argv[2] != "none") {
            self::$subname = $argv[2];
        } else {
            self::$subname = "";
        }
        //don't let our script be run from apache
        if (isset($_SERVER['DOCUMENT_ROOT']) &&
            strlen($_SERVER['DOCUMENT_ROOT']) > 0) {
            echo "BAD REQUEST";
            exit();
        }
        if (!isset($argv[1])) {
            echo "$name needs to be run with a command-line argument.\n";
            echo "For example,\n";
            echo "php $name.php start //starts the $name as a daemon\n";
            echo "php $name.php stop //stops the $name daemon\n";
            echo "php $name.php terminal //runs $name within the current ".
                "process, not as a daemon, output going to the terminal\n";
            exit();
        }
        $messages_file = self::getMesssageFileName(self::$name, self::$subname);
        switch ($argv[1]) {
            case "start":
                // everything after the subname is passed on to the child
                $options = "";
                for ($i = 3; $i < count($argv); $i++) {
                    $options .= " ".$argv[$i];
                }
                $subname = (!isset($argv[2]) || $argv[2] == 'none') ?
                    'none' :self::$subname;
                $name_prefix = (isset($argv[3])) ? $argv[3] : self::$subname;
                $name_string = CrawlDaemon::getNameString($name, $name_prefix);
                echo "Starting $name_string...\n";
                CrawlDaemon::start($name, $subname, $options, $exit_type);
                break;
            case "stop":
                CrawlDaemon::stop($name, self::$subname);
                break;
            case "terminal":
                self::$mode = 'terminal';
                $info = [];
                $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
                file_put_contents($messages_file, serialize($info));
                chmod($messages_file, 0777);
                C\nsdefine("LOG_TO_FILES", false);
                break;
            case "child":
                self::$mode = 'daemon';
                $info = [];
                $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
                file_put_contents($messages_file, serialize($info));
                chmod($messages_file, 0777);
                C\nsdefine("LOG_TO_FILES", true);
                // if false log messages are sent to the console
                break;
            default:
                exit();
        }
    }
    /**
     * Used to start a daemon running in the background
     *
     * @param string $name the main name of this daemon such as queue_server
     *      or fetcher.
     * @param string $subname the instance name if it is possible for more
     *      than one copy of the daemon to be running at the same time
     * @param string $options a string of additional command line options
     * @param int $exit whether this function should exit > 0 or return (1)
     *      by default a lock file is only written if exit (this allows
     *      both queue server processes (Indexer and Scheduler) to use the
     *      same lock file. If exit is >=3 or <= -3 then doesn't check lock
     *      to see if already running before starting
     */
    public static function start($name, $subname = "", $options = "", $exit = 1)
    {
        $tmp_subname = ($subname == 'none') ? '' : $subname;
        $lock_file = CrawlDaemon::getLockFileName($name, $tmp_subname);
        if (file_exists($lock_file) && ($exit < 3 && $exit > -3)) {
            // the lock file holds the time it was last refreshed
            $time = intval(file_get_contents($lock_file));
            if (time() - $time < C\PROCESS_TIMEOUT) {
                echo "$name appears to be already running...\n";
                echo "Try stopping it first, then running start.";
                exit();
            }
        }
        $php = "php";
        if (C\nsdefined("PHP_PATH") ) {
            $php = C\PHP_PATH."/".$php;
        }
        /* make sure hhvm has write access to the folder
           of the owner of the webserver process so it can write
           a .hhvm.hhbc file
         */
        if (function_exists("posix_getpwuid")) {
            $process_user_info = posix_getpwuid(posix_getuid());
            // posix_getpwuid gives false when the user cannot be looked up
            $process_home = (is_array($process_user_info) &&
                isset($process_user_info['dir'])) ?
                $process_user_info['dir'] : "";
            if (C\nsdefined("FORCE_HHVM") || (
                stristr(phpversion(), "hhvm") !==false &&
                $process_home != "" &&
                posix_access($process_home, POSIX_W_OK))) {
                $php = 'hhvm -f ';
                if (C\nsdefined("HHVM_PATH") ) {
                    $php = C\HHVM_PATH."/".$php;
                }
            }
        }
        // %s in $script is later replaced by the subname and options
        if (strstr(PHP_OS, "WIN")) {
            $base_dir = str_replace("/", "\\", C\BASE_DIR);
            $script = "start /B $php ".
                $base_dir."\\executables\\$name.php child %s";
        } else {
            $script = "$php '".
                C\BASE_DIR."/executables/$name.php' child %s < /dev/null ".
                " > /dev/null &";
        }
        $total_options = "$subname $options";
        $at_job = sprintf($script, $total_options);
        // popen gives false on failure, which pclose must not be handed
        $handle = popen($at_job, "r");
        if ($handle !== false) {
            pclose($handle);
        }
        if ($exit != 0) {
            file_put_contents($lock_file, time());
        }
        if ($exit > 0) {
            exit();
        }
    }
    /**
     * Used to execute a shell command in its own process
     *
     * @param string $cmd the command to execute
     */
    public static function execInOwnProcess($cmd)
    {
        if (strstr(PHP_OS, "WIN")) {
            $job = "start /B $cmd ";
        } else {
            $job = "$cmd < /dev/null > /dev/null &";
        }
        // popen gives false on failure, which pclose must not be handed
        $handle = popen($job, "r");
        if ($handle !== false) {
            pclose($handle);
        }
    }
    /**
     * Used to stop a daemon that is running in the background. This is
     * done by deleting the daemon's lock file.
     *
     * @param string $name the main name of this daemon such as queue_server
     *      or fetcher.
     * @param string $subname the instance name if it is possible for more
     *      than one copy of the daemon to be running at the same time
     * @param bool $exit whether this method should just return (false) or
     *      call exit() (true)
     */
    public static function stop($name, $subname = "", $exit = true)
    {
        $name_string = CrawlDaemon::getNameString($name, $subname);
        $lock_file = CrawlDaemon::getLockFileName($name, $subname);
        // only log when run from the command line
        $not_web_setting = (php_sapi_name() == 'cli');
        if (file_exists($lock_file)) {
            unlink($lock_file);
            if ($not_web_setting) {
                crawlLog("Sending stop signal to $name_string...");
            }
        } else if ($not_web_setting) {
            crawlLog("$name_string does not appear to running...");
        }
        if ($exit) {
            exit();
        }
    }
    /**
     * Used to return the string name of the messages file used to pass
     * messages to a daemon running in the background
     *
     * @param string $name the main name of this daemon such as queue_server
     *      or fetcher.
     * @param string $subname the instance name if it is possible for more
     *      than one copy of the daemon to be running at the same time
     *
     * @return string the name of the message file for the daemon with
     *      the given name and subname
     */
    public static function getMesssageFileName($name, $subname = "")
    {
        return C\CRAWL_DIR."/schedules/".self::getNameString($name, $subname)
            . "Messages.txt";
    }
    /**
     * Used to return the string name of the lock file used by a daemon
     *
     * @param string $name the main name of this daemon such as queue_server
     *      or fetcher.
     * @param string $subname the instance name if it is possible for more
     *      than one copy of the daemon to be running at the same time
     *
     * @return string the name of the lock file for the daemon with
     *      the given name and subname
     */
    public static function getLockFileName($name, $subname = "")
    {
        return C\CRAWL_DIR."/schedules/".self::getNameString($name, $subname)
            . "Lock.txt";
    }
    /**
     * Used to return a string name for a given daemon instance
     *
     * @param string $name the main name of this daemon such as queue_server
     *      or fetcher.
     * @param string $subname the instance name if it is possible for more
     *      than one copy of the daemon to be running at the same time
     *
     * @return string a single name that combines the name and subname:
     *      "subname-name", or just the name when the subname is empty
     */
    public static function getNameString($name, $subname)
    {
        return ($subname == "") ? $name : $subname."-".$name;
    }
    /**
     * Returns the statuses of the running daemons. A daemon counts as
     * running if its lock file was modified less than PROCESS_TIMEOUT
     * seconds ago.
     *
     * @return array 2d array active_daemons[name][instance] = true; a
     *      daemon without a subname is recorded as
     *      active_daemons[name][-1] = 1
     */
    public static function statuses()
    {
        $prefix = C\CRAWL_DIR . "/schedules/";
        $prefix_len = strlen($prefix);
        $suffix = "Lock.txt";
        $suffix_len = strlen($suffix);
        $lock_files = "$prefix*$suffix";
        clearstatcache();
        $time = time();
        $active_daemons = [];
        $files = glob($lock_files);
        if ($files === false) {
            // glob signals an error with false; treat it as no lock files
            $files = [];
        }
        foreach ($files as $file) {
            if ($time - filemtime($file) < C\PROCESS_TIMEOUT) {
                // strip the directory prefix and the Lock.txt suffix
                $len = strlen($file) - $suffix_len - $prefix_len;
                $pre_name = substr($file, $prefix_len, $len);
                $pre_name_parts = explode("-", $pre_name);
                if (count($pre_name_parts) == 1) {
                    $active_daemons[$pre_name][-1] = 1;
                } else {
                    // text before the first "-" is the instance (subname)
                    $first = array_shift($pre_name_parts);
                    $rest = implode("-", $pre_name_parts);
                    $active_daemons[$rest][$first] = true;
                }
            }
        }
        return $active_daemons;
    }
}
|
622
src/library/FetchGitRepositoryUrls.php
Normal file
622
src/library/FetchGitRepositoryUrls.php
Normal file
|
@ -0,0 +1,622 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Snigdha Rao Parvatneni
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Library of functions used to fetch Git internal urls
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class FetchGitRepositoryUrls implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Known repository types, keyed by the url extension that identifies them
|
||||
* @var array
|
||||
*/
|
||||
public static $repository_types = ['git' => 'git', 'svn' => 'svn',
|
||||
'cvs' => 'cvs', 'vss' => 'vss', 'mercurial' => 'mercurial',
|
||||
'monotone' => 'monotone', 'bazaar' => 'bazaar', 'darcs' => 'darcs',
|
||||
'arch' => 'arch'];
|
||||
/**
|
||||
* An array used to store all the Git internal urls
|
||||
* @var array
|
||||
*/
|
||||
public static $all_git_urls;
|
||||
/**
|
||||
* An indicator to tell no actions to be taken
|
||||
*/
|
||||
const INDICATOR_NONE = 'none';
|
||||
/**
|
||||
* An indicator to indicate git repository
|
||||
*/
|
||||
const INDICATOR_GIT = 'git';
|
||||
/**
|
||||
* An indicator to tell more git urls need to be fetched
|
||||
*/
|
||||
const GIT_URL_CONTINUE = '@@@@';
|
||||
/**
|
||||
* An indicator to tell starting position of Git url to be used
|
||||
*/
|
||||
const GIT_BASE_URL_START = 0;
|
||||
/**
|
||||
* An indicator to tell ending position of Git url to be used
|
||||
*/
|
||||
const GIT_BASE_URL_END = '###';
|
||||
/**
|
||||
* A fixed component to be used with Git base url to form Git first url
|
||||
*/
|
||||
const GIT_URL_EXTENSION = 'info/refs?service=git-upload-pack';
|
||||
/**
|
||||
* A fixed component to be used with Git urls to get next Git urls
|
||||
*/
|
||||
const GIT_URL_OBJECT = 'objects/';
|
||||
/**
|
||||
* A fixed indicator used to get last letter of git base url
|
||||
*/
|
||||
const GIT_BASE_URL_END_POSITION = -1;
|
||||
/**
|
||||
* A fixed indicator used to get last letter of git base url
|
||||
*/
|
||||
const GIT_BASE_END_LETTER = 1;
|
||||
/**
|
||||
* A fixed position used to indicate starting point to fetch next Git url
|
||||
* from the master file
|
||||
*/
|
||||
const GIT_NEXT_URL_START = 0;
|
||||
/**
|
||||
* A fixed position used to indicate ending position to fetch next Git url
|
||||
* from the master file
|
||||
*/
|
||||
const GIT_NEXT_URL_END = 40;
|
||||
/**
|
||||
* A fixed indicator used to make desired Git folder structure from SHA hash
|
||||
*/
|
||||
const GIT_URL_SPLIT = '/';
|
||||
/**
|
||||
* A fixed indicator used to mark starting position of SHA hash of Git
|
||||
* master tree
|
||||
*/
|
||||
const GIT_MASTER_TREE_HASH_START = 16;
|
||||
/**
|
||||
* A fixed indicator used to mark ending position of SHA hash of Git
|
||||
* master tree
|
||||
*/
|
||||
const GIT_MASTER_TREE_HASH_END = 41;
|
||||
/**
|
||||
* A fixed indicator used to mark starting position of SHA hash used to
|
||||
* indicate Git object folder
|
||||
*/
|
||||
const GIT_FOLDER_NAME_START = 0;
|
||||
/**
|
||||
* A fixed indicator used to mark ending position of SHA hash used to
|
||||
* indicate Git object folder
|
||||
*/
|
||||
const GIT_FOLDER_NAME_END = 2;
|
||||
/**
|
||||
* A fixed indicator used to mark starting position of SHA hash used to
|
||||
* indicate Git object file
|
||||
*/
|
||||
const GIT_FILE_NAME_START = 2;
|
||||
/**
|
||||
* A fixed indicator used to mark ending position of SHA hash used to
|
||||
* indicate Git object file
|
||||
*/
|
||||
const GIT_FILE_NAME_END = 38;
|
||||
/**
|
||||
* A fixed indicator used to indicate Git blob object
|
||||
*/
|
||||
const GIT_BLOB_OBJECT = "blob";
|
||||
/**
|
||||
* A fixed indicator used to indicate Git tree object
|
||||
*/
|
||||
const GIT_TREE_OBJECT = "tree";
|
||||
/**
|
||||
* A cURL time out parameter
|
||||
*/
|
||||
const CURL_TIMEOUT = 5;
|
||||
/**
|
||||
* A cURL transfer parameter
|
||||
*/
|
||||
const CURL_TRANSFER = 1;
|
||||
/**
|
||||
* Git blob access code starting position
|
||||
*/
|
||||
const BLOB_ACCESS_CODE_START = 0;
|
||||
/**
|
||||
* Git blob access code ending position
|
||||
*/
|
||||
const BLOB_ACCESS_CODE_END = 6;
|
||||
/**
|
||||
* Git tree access code starting position
|
||||
*/
|
||||
const TREE_ACCESS_CODE_START = 0;
|
||||
/**
|
||||
* Git tree access code ending position
|
||||
*/
|
||||
const TREE_ACCESS_CODE_END = 5;
|
||||
/**
|
||||
* Git SHA hash binary starting position
|
||||
*/
|
||||
const SHA_HASH_BINARY_START = 0;
|
||||
/**
|
||||
* Git SHA hash binary ending position
|
||||
*/
|
||||
const SHA_HASH_BINARY_END = 20;
|
||||
/**
|
||||
* An indicator for the start of a Git file or folder name
|
||||
*/
|
||||
const GIT_NAME_START = 0;
|
||||
/**
|
||||
* A indicator to represent next position after the access code in Git
|
||||
* blob object
|
||||
*/
|
||||
const GIT_BLOB_NEXT = 7;
|
||||
/**
|
||||
* A indicator to represent next position after the access code in Git
|
||||
* tree object
|
||||
*/
|
||||
const GIT_TREE_NEXT = 6;
|
||||
/**
|
||||
* The null character that terminates a file or folder name within a Git
|
||||
* tree object
|
||||
*/
|
||||
const HEX_NULL_CHARACTER = "\x00";
|
||||
/**
|
||||
* A indicator to represent that a git file is a blob file
|
||||
*/
|
||||
const GIT_BLOB_INDICATOR = '100';
|
||||
/**
|
||||
* A indicator to represent that a git file is a tree file
|
||||
*/
|
||||
const GIT_TREE_INDICATOR = '400';
|
||||
/**
|
||||
* Checks repository type based on extension
|
||||
*
|
||||
* @param string $extension to check
|
||||
* @return string $repository_type repository type based on the
|
||||
* extension of urls
|
||||
*/
|
||||
public static function checkForRepository($extension)
{
    // Extensions without an entry in the lookup table map to "none"
    return isset(self::$repository_types[$extension]) ?
        self::$repository_types[$extension] : self::INDICATOR_NONE;
}
|
||||
/**
|
||||
* Sets up the seed sites with urls from a git repository (updates
|
||||
* these sites if have already started downloading from repository)
|
||||
*
|
||||
* @param string $url_to_check url needs to be processed
|
||||
* @param int $counter to keep track of number of urls processed
|
||||
* @param array $seeds store sites which are ready to be downloaded
|
||||
* @param array $repository_indicator indicates the type of the repository
|
||||
* @param array $site_pair contains original Git url crawled
|
||||
* @param int $total_git_urls number of urls in repository less those
|
||||
* already processed
|
||||
* @param array $all_git_urls current list of urls from git repository
|
||||
* @return array $git_internal_urls containing all the internal Git urls
|
||||
* fetched from the parent Git url
|
||||
*/
|
||||
public static function setGitRepositoryUrl($url_to_check, $counter, $seeds,
    $repository_indicator, $site_pair, $total_git_urls, $all_git_urls)
{
    /* Defined up front: when nothing can be scheduled in this call (empty
       repository listing or no free download slots) the loop below never
       runs and 'index' would otherwise be read while undefined.
     */
    $git_url_index = 0;
    $free_slots = intval(C\NUM_MULTI_CURL_PAGES) - $counter;
    /* strpos returns false for "not found"; compare strictly so that a
       match at offset 0 is not mistaken for a miss.
     */
    $position = strpos($url_to_check, self::GIT_URL_CONTINUE);
    if ($position === false) {
        // First visit: download the complete list of blob urls
        $all_git_urls = self::fetchGitRepositoryUrl($url_to_check);
        $count_all_git_urls = count($all_git_urls);
        $git_index = 0;
    } else {
        /* Continuation url: the numbers following each continue marker
           add up to the index of the next unprocessed entry of
           $all_git_urls.
         */
        $extension_string = substr($url_to_check, $position);
        $git_index = intval(array_sum(explode(self::GIT_URL_CONTINUE,
            $extension_string)));
        $count_all_git_urls = $total_git_urls;
    }
    // Schedule no more urls than there are free slots or urls left
    $num_to_add = $count_all_git_urls - $git_index;
    if ($free_slots < $num_to_add) {
        $num_to_add = $free_slots;
    }
    for ($j = 0; $j < $num_to_add; $j++) {
        $git_url = $all_git_urls[$git_index];
        $seeds[$counter][self::URL] = $git_url[2];
        $seeds[$counter][self::WEIGHT] = $site_pair['value'][1];
        $seeds[$counter][self::CRAWL_DELAY] = $site_pair['value'][2];
        /* The indicator stored with a seed is the one in effect before
           this entry was processed; it is refreshed afterwards.
         */
        $seeds[$counter][self::REPOSITORY_TYPE] = $repository_indicator;
        $seeds[$counter][self::FILE_NAME] = $git_url[0];
        $seeds[$counter][self::SHA_HASH] = $git_url[1];
        $counter++;
        $git_index++;
        // 'index' counts the urls scheduled by this call only
        $git_url_index = $j + 1;
        $repository_indicator = ($git_index >= $count_all_git_urls) ?
            self::INDICATOR_NONE : self::INDICATOR_GIT;
    }
    // Step back so 'position' refers to the last slot considered
    $counter--;
    $git_internal_urls = [];
    $git_internal_urls['position'] = $counter;
    $git_internal_urls['index'] = $git_url_index;
    $git_internal_urls['seeds'] = $seeds;
    $git_internal_urls['indicator'] = $repository_indicator;
    $git_internal_urls['count'] = $count_all_git_urls;
    $git_internal_urls['all'] = $all_git_urls;
    return $git_internal_urls;
}
|
||||
/**
|
||||
* Get the Git internal urls from the parent Git url
|
||||
*
|
||||
* @param string $url_to_check url needs to be processed
|
||||
* @return an array $git_next_urls consists of list of Git
|
||||
* internal urls which are requested during a git clone
|
||||
*/
|
||||
public static function fetchGitRepositoryUrl($url_to_check)
{
    // Cut the url at the base-url end marker and ensure a trailing separator
    $marker_offset = strpos($url_to_check, self::GIT_BASE_URL_END);
    $base_url = substr($url_to_check, self::GIT_BASE_URL_START,
        $marker_offset);
    $last_letter = substr($base_url, self::GIT_BASE_URL_END_POSITION,
        self::GIT_BASE_END_LETTER);
    if ($last_letter != self::GIT_URL_SPLIT) {
        $base_url .= self::GIT_URL_SPLIT;
    }
    // The first url is read as is; the two after it are uncompressed
    $first_content = self::getNextGitUrl(
        $base_url . self::GIT_URL_EXTENSION, false);
    $second_content = self::getNextGitUrl(
        self::getGitMasterFile($first_content, $base_url), true);
    $third_content = self::getNextGitUrl(
        self::getGitMasterTree($second_content, $base_url), true);
    return self::getObjects($third_content, $base_url);
}
|
||||
/**
|
||||
* Get the Git second url which points to Git master tree structure
|
||||
*
|
||||
* @param string $git_first_url_content contents of Git first url
|
||||
* @param string $git_base_url common portion of Git urls
|
||||
* @return string $git_next_url consists of second internal Git url
|
||||
*/
|
||||
public static function getGitMasterFile($git_first_url_content,
    $git_base_url)
{
    // The leading characters of the first url's content name the next object
    $next_object_hash = substr($git_first_url_content,
        self::GIT_NEXT_URL_START, self::GIT_NEXT_URL_END);
    /* urlMaker splits a hash into folder and file parts with the same
       offsets and joins them onto the base url in the same way
     */
    return self::urlMaker($next_object_hash, $git_base_url);
}
|
||||
/**
|
||||
* Get the Git third url which contains the information about the
|
||||
* organization of entire git repository
|
||||
*
|
||||
* @param string $git_second_url_content contents of Git second url
|
||||
* @param string $git_base_url common portion of git urls
|
||||
* @return string $git_next_url consists of third internal git url
|
||||
*/
|
||||
public static function getGitMasterTree($git_second_url_content,
    $git_base_url)
{
    /* An uncompressed commit object reads "commit <size>\0tree <sha>...".
       The decimal <size> has a variable number of digits, so the hash does
       not sit at a constant offset; find it relative to the null byte that
       ends the header.
     */
    $content = (string)$git_second_url_content;
    $tree_label = "tree ";
    $header_end = strpos($content, self::HEX_NULL_CHARACTER);
    $git_master_tree_hash = "";
    if ($header_end !== false && substr($content, $header_end + 1,
        strlen($tree_label)) === $tree_label) {
        $git_master_tree_hash = substr($content,
            $header_end + 1 + strlen($tree_label),
            self::GIT_FOLDER_NAME_END + self::GIT_FILE_NAME_END);
    }
    if (strlen($git_master_tree_hash) != self::GIT_FOLDER_NAME_END +
        self::GIT_FILE_NAME_END) {
        // Unrecognized layout: fall back to the historical fixed offsets
        $git_master_tree_hash = substr($git_second_url_content,
            self::GIT_MASTER_TREE_HASH_START,
            self::GIT_MASTER_TREE_HASH_END);
    }
    // Objects are stored as <first two hash letters>/<remaining letters>
    $git_object_folder_name = substr($git_master_tree_hash,
        self::GIT_FOLDER_NAME_START, self::GIT_FOLDER_NAME_END);
    $git_object_file_name = substr($git_master_tree_hash,
        self::GIT_FILE_NAME_START, self::GIT_FILE_NAME_END);
    $git_object_path = $git_object_folder_name . self::GIT_URL_SPLIT .
        $git_object_file_name;
    return $git_base_url . self::GIT_URL_OBJECT . $git_object_path;
}
|
||||
/**
|
||||
* Get the Git content from url which will be used to get the
|
||||
* next git url
|
||||
*
|
||||
* @param string $git_url git url to extract contents from it
|
||||
* @param string $compression_indicator indicator for compress and
|
||||
* uncompress contents
|
||||
* @return string $git_object_content consists contents extracted from the
|
||||
* url
|
||||
*/
|
||||
public static function getNextGitUrl($git_url, $compression_indicator)
{
    $git_object_content = self::getGitData($git_url);
    if (!$compression_indicator) {
        // Caller asked for the downloaded bytes untouched
        return $git_object_content;
    }
    /* A failed or empty download cannot be uncompressed; handing it to
       gzuncompress would only raise a warning before yielding false, so
       report the failure directly.
     */
    if ($git_object_content === false || $git_object_content === "") {
        return false;
    }
    return gzuncompress($git_object_content);
}
|
||||
/**
|
||||
* Get the Git blob and tree objects
|
||||
*
|
||||
* @param string $git_object_content uncompressed content of git master tree
|
||||
* file
|
||||
* @param string $git_base_url common content of git url
|
||||
* @return array $blob_url contains information and url for git blob objects
|
||||
*/
|
||||
public static function getObjects($git_object_content, $git_base_url)
{
    $blob_url = [];
    $remaining = (string)$git_object_content;
    /* NOTE(review): the indicators are located with a plain substring
       search, so they could presumably also match inside a name, a size
       or a binary hash -- confirm against real tree objects.
     */
    while ($remaining !== "") {
        $blob_position = strpos($remaining, self::GIT_BLOB_INDICATOR);
        $tree_position = strpos($remaining, self::GIT_TREE_INDICATOR);
        if ($blob_position === false && $tree_position === false) {
            /* Nothing left to parse. Without this exit the content would
               never shrink and the loop would not terminate.
             */
            break;
        }
        $length = strlen($remaining);
        $blob_is_next = $tree_position === false ||
            ($blob_position !== false && $blob_position < $tree_position);
        if ($blob_is_next) {
            $entry = self::readBlobSha($remaining, $blob_position, $length,
                $git_base_url);
            // A blob contributes one [file name, sha hash, url] triple
            $blob_url[] = [
                $entry['value']['name'],
                $entry['value']['hash'],
                $entry['value']['url'],
            ];
        } else {
            $entry = self::readTreeSha($remaining, $tree_position, $length,
                $git_base_url);
            /* For a tree, 'indicator' carries the triples of every blob
               found by recursing into that folder
             */
            foreach ($entry['indicator'] as $nested_blob) {
                $blob_url[] = $nested_blob;
            }
        }
        // Continue with whatever follows the entry just consumed
        $remaining = (string)$entry['content'];
    }
    return $blob_url;
}
|
||||
/**
|
||||
* checks the position of access code for null values
|
||||
*
|
||||
* @param string $git_blob_position first occurrence of git blob access code
|
||||
* @param string $git_tree_position first occurrence of git tree access code
|
||||
* @param string $git_object_content compressed content of git master tree
|
||||
* @return array $git_object_positions blob and tree positions, with the
|
||||
* length of the content used for any access code that was not found
|
||||
*/
|
||||
public static function checkPosition($git_blob_position, $git_tree_position,
    $git_object_content)
{
    /* A boolean position means the access code was not found; the length
       of the content then stands in for it
     */
    return [
        'blob' => is_bool($git_blob_position) ?
            strlen($git_object_content) : $git_blob_position,
        'tree' => is_bool($git_tree_position) ?
            strlen($git_object_content) : $git_tree_position,
    ];
}
|
||||
/**
|
||||
* Get the details of the blob file i.e blob file name, sha hash and content
|
||||
*
|
||||
* @param string $git_object_content compressed content of git master tree
|
||||
* @param string $blob_position first occurrence of git blob access code
|
||||
* in $content
|
||||
* @param string $length length of the compressed content of git master tree
|
||||
* @param string $git_base_url common portion of git url
|
||||
* @return array $git_blob_content contains details of git blob object
|
||||
*/
|
||||
public static function readBlobSha($git_object_content, $blob_position,
    $length, $git_base_url)
{
    // Discard everything in front of the blob access code
    $rest = substr($git_object_content, $blob_position, $length);
    $code = substr($rest, self::BLOB_ACCESS_CODE_START,
        self::BLOB_ACCESS_CODE_END);
    // The file name runs from just past the access code to the null byte
    $rest = substr($rest, self::GIT_BLOB_NEXT, $length);
    $name_end = strpos($rest, self::HEX_NULL_CHARACTER);
    $name = substr($rest, self::GIT_NAME_START, $name_end);
    // The binary SHA hash sits directly behind the null byte
    $rest = substr($rest, $name_end + 1, $length);
    $hash = bin2hex(substr($rest, self::SHA_HASH_BINARY_START,
        self::SHA_HASH_BINARY_END));
    // What follows the hash is returned so the caller can keep parsing
    $rest = substr($rest, self::SHA_HASH_BINARY_END, $length);
    return [
        'value' => [
            'code' => $code,
            'name' => $name,
            'hash' => $hash,
            'url' => self::urlMaker($hash, $git_base_url),
            'indicator' => self::GIT_BLOB_OBJECT,
        ],
        'content' => $rest,
        'indicator' => self::GIT_BLOB_OBJECT,
    ];
}
|
||||
/**
|
||||
* Get the details of the tree file i.e folder name, sha hash and
|
||||
* blob url inside the tree
|
||||
*
|
||||
* @param string $git_object_content compressed content of git master tree
|
||||
* @param string $tree_position first occurrence of git tree access code in
|
||||
* the $content
|
||||
* @param string $length length of the compressed content of git master tree
|
||||
* @param string $git_base_url common portion of git url
|
||||
* @return array $git_tree_content contains details of git tree object
|
||||
*/
|
||||
public static function readTreeSha($git_object_content, $tree_position,
    $length, $git_base_url)
{
    // Skip ahead to the tree access code
    $tail = substr($git_object_content, $tree_position, $length);
    $mode = substr($tail, self::TREE_ACCESS_CODE_START,
        self::TREE_ACCESS_CODE_END);
    // Folder name: from just past the access code up to the null byte
    $tail = substr($tail, self::GIT_TREE_NEXT, $length);
    $null_offset = strpos($tail, self::HEX_NULL_CHARACTER);
    $folder = substr($tail, self::GIT_NAME_START, $null_offset);
    // Binary SHA hash of the folder's own tree object
    $tail = substr($tail, $null_offset + 1, $length);
    $tree_hash = bin2hex(substr($tail, self::SHA_HASH_BINARY_START,
        self::SHA_HASH_BINARY_END));
    $tail = substr($tail, self::SHA_HASH_BINARY_END, $length);
    return [
        'value' => [
            'code' => $mode,
            'name' => $folder,
            'hash' => $tree_hash,
            'indicator' => self::GIT_TREE_OBJECT,
        ],
        'content' => $tail,
        // For a tree this slot holds the blobs found inside the folder
        'indicator' => self::checkNestedStructure($tree_hash, $git_base_url),
    ];
}
|
||||
/**
|
||||
* Checks the nested structure inside git tree object
|
||||
*
|
||||
* @param string $sha_hash sha of the git tree object
|
||||
* @param string $git_base_url common portion of the parent git url
|
||||
* @return string $blob_url contains url of the blob file inside the folder
|
||||
*/
|
||||
public static function checkNestedStructure($sha_hash, $git_base_url)
{
    $url = self::urlMaker($sha_hash, $git_base_url);
    $git_compressed_content = self::getGitData($url);
    /* A failed or empty download has nothing to uncompress. Returning the
       empty list here matches what parsing no content produces, without
       first pushing an invalid value through gzuncompress.
     */
    if ($git_compressed_content === false ||
        $git_compressed_content === "") {
        return [];
    }
    $git_uncompressed_content = gzuncompress($git_compressed_content);
    if ($git_uncompressed_content === false) {
        // Content was not valid compressed data
        return [];
    }
    return self::getObjects($git_uncompressed_content, $git_base_url);
}
|
||||
/**
|
||||
* Makes the git clone internal url for blob objects
|
||||
*
|
||||
* @param string $sha_hash of the git blob object
|
||||
* @param string $git_base_url common portion of git url
|
||||
* @return string $git_object_url contains the complete url of the blob file
|
||||
*/
|
||||
public static function urlMaker($sha_hash, $git_base_url)
{
    // Object urls have the shape <base><object dir><folder>/<file>
    return $git_base_url . self::GIT_URL_OBJECT .
        substr($sha_hash, self::GIT_FOLDER_NAME_START,
            self::GIT_FOLDER_NAME_END) .
        self::GIT_URL_SPLIT .
        substr($sha_hash, self::GIT_FILE_NAME_START,
            self::GIT_FILE_NAME_END);
}
|
||||
/**
|
||||
* Makes the cURL call to get the contents
|
||||
*
|
||||
* @param string $git_url url to download the contents
|
||||
* @return string $git_content actual content of the git url
|
||||
*/
|
||||
public static function getGitData($git_url)
{
    $handle = curl_init();
    // Return the body from curl_exec and bound the connection phase
    curl_setopt_array($handle, [
        CURLOPT_URL => $git_url,
        CURLOPT_RETURNTRANSFER => self::CURL_TRANSFER,
        CURLOPT_CONNECTTIMEOUT => self::CURL_TIMEOUT,
    ]);
    $response = curl_exec($handle);
    curl_close($handle);
    return $response;
}
|
||||
}
|
803
src/library/FetchUrl.php
Normal file
803
src/library/FetchUrl.php
Normal file
|
@ -0,0 +1,803 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library\CrawlConstants;
|
||||
use seekquarry\yioop\library\UrlParser;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
*
|
||||
* Code used to manage HTTP or Gopher requests from one or more URLS
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class FetchUrl implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Make multi_curl requests for an array of sites with urls or onion urls
|
||||
*
|
||||
* @param array $sites an array containing urls of pages to request
|
||||
* @param bool $timer flag, true means print timing statistics to log
|
||||
* @param int $page_range_request maximum number of bytes to download/page
|
||||
* 0 means download all
|
||||
* @param string $temp_dir folder to store temporary ip header info
|
||||
* @param string $key the component of $sites[$i] that has the value of
|
||||
* a url to get defaults to URL
|
||||
* @param string $value component of $sites[$i] in which to store the
|
||||
* page that was gotten
|
||||
* @param bool $minimal if true do a faster request of pages by not
|
||||
* doing things like extract HTTP headers sent, etc.
|
||||
* @param array $post_data data to be POST'd to each site
|
||||
* @param bool $follow whether to follow redirects or not
|
||||
* @param string $tor_proxy url of a proxy that knows how to download
|
||||
* .onion urls
|
||||
* @param array $proxy_servers if not [], then an array of proxy
|
||||
* server to use rather than to directly download web pages from
|
||||
* the current machine
|
||||
*
|
||||
* @return array an updated array with the contents of those pages
|
||||
*/
|
||||
public static function getPages($sites, $timer = false,
|
||||
$page_range_request = C\PAGE_RANGE_REQUEST, $temp_dir = null,
|
||||
$key=CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal=false,
|
||||
$post_data = null, $follow = false, $tor_proxy = "",
|
||||
$proxy_servers=[])
|
||||
{
|
||||
static $agent_handler = null;
|
||||
static $handler_time = 0;
|
||||
if (empty($agent_handler)) {
|
||||
/* try to keep handler around between calls to allow for connection
|
||||
reuse
|
||||
*/
|
||||
$agent_handler = curl_multi_init();
|
||||
$handler_time = microtime(true);
|
||||
}
|
||||
$active = null;
|
||||
$start_time = microtime(true);
|
||||
if (!$minimal && $temp_dir == null) {
|
||||
$temp_dir = C\CRAWL_DIR."/temp";
|
||||
if (!file_exists($temp_dir)) {
|
||||
mkdir($temp_dir);
|
||||
}
|
||||
}
|
||||
//Set-up requests
|
||||
$num_sites = count($sites);
|
||||
for ($i = 0; $i < $num_sites; $i++) {
|
||||
$is_gopher = false;
|
||||
$sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
|
||||
if (isset($sites[$i][$key])) {
|
||||
list($sites[$i][$key], $url, $headers) =
|
||||
self::prepareUrlHeaders($sites[$i][$key], $minimal,
|
||||
$proxy_servers);
|
||||
if ($headers == "gopher") {
|
||||
$is_gopher = true;
|
||||
$sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
|
||||
$headers = [];
|
||||
}
|
||||
$sites[$i][0] = curl_init();
|
||||
if (!$minimal) {
|
||||
$ip_holder[$i] = fopen("$temp_dir/tmp$i.txt", 'w+');
|
||||
curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
|
||||
curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
|
||||
}
|
||||
curl_setopt($sites[$i][0], CURLOPT_USERAGENT, C\USER_AGENT);
|
||||
curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
|
||||
CURL_IPRESOLVE_WHATEVER);
|
||||
curl_setopt($sites[$i][0], CURLOPT_URL, $url);
|
||||
if (strcmp(substr($url,-10), "robots.txt") == 0 ) {
|
||||
$sites[$i]['ROBOT'] = true;
|
||||
$follow = true; /*wikipedia redirects their robot page. grr
|
||||
want to force this for robots pages
|
||||
*/
|
||||
}
|
||||
curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
|
||||
curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
|
||||
curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYPEER, false);
|
||||
curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
|
||||
curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
|
||||
curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,
|
||||
C\PAGE_TIMEOUT);
|
||||
curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, C\PAGE_TIMEOUT);
|
||||
if (stripos($url,'.onion') !== false && $tor_proxy != "") {
|
||||
curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
|
||||
//CURLPROXY_SOCKS5_HOSTNAME = 7
|
||||
curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
|
||||
if ($timer) {
|
||||
crawlLog("Using Tor proxy for $url..");
|
||||
}
|
||||
} else if ($proxy_servers != [] && !$is_gopher) {
|
||||
$select_proxy = rand(0, count($proxy_servers) - 1);
|
||||
$proxy_server = $proxy_servers[$select_proxy];
|
||||
$proxy_parts = explode(":", $proxy_server);
|
||||
$proxy_ip = $proxy_parts[0];
|
||||
if (!isset($proxy_parts[2]) ||
|
||||
strtolower($proxy_parts[2]) == 'http') {
|
||||
$proxy_type = CURLPROXY_HTTP;
|
||||
} else if (strtolower($proxy_parts[2]) == 'socks5') {
|
||||
$proxy_type = CURLPROXY_SOCKS5;
|
||||
} else {
|
||||
$proxy_type = $proxy_parts[2];
|
||||
}
|
||||
if (isset($proxy_parts[1])) {
|
||||
$proxy_port = $proxy_parts[1];
|
||||
} else {
|
||||
$proxy_port = "80";
|
||||
}
|
||||
curl_setopt($sites[$i][0], CURLOPT_PROXY,
|
||||
"$proxy_ip:$proxy_port");
|
||||
curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE,
|
||||
$proxy_type);
|
||||
if ($timer) {
|
||||
crawlLog("Selecting proxy $select_proxy for $url");
|
||||
}
|
||||
}
|
||||
if (!$minimal) {
|
||||
curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
|
||||
}
|
||||
//make lighttpd happier
|
||||
if (!$is_gopher) {
|
||||
curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER,
|
||||
$headers);
|
||||
}
|
||||
curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
|
||||
// ^ need to set for sites like att that use gzip
|
||||
if ($page_range_request > 0 && empty(
|
||||
$sites[$i][CrawlConstants::NO_RANGE])) {
|
||||
curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-".
|
||||
$page_range_request);
|
||||
} else if (!empty( $sites[$i][CrawlConstants::NO_RANGE])) {
|
||||
crawlLog("No range used for $url");
|
||||
}
|
||||
if ($post_data != null) {
|
||||
curl_setopt($sites[$i][0], CURLOPT_POST, true);
|
||||
curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
|
||||
$post_data[$i]);
|
||||
}
|
||||
curl_multi_add_handle($agent_handler, $sites[$i][0]);
|
||||
}
|
||||
}
|
||||
if ($timer) {
|
||||
crawlLog(" Init Get Pages ".(changeInMicrotime($start_time)));
|
||||
}
|
||||
$start_time = microtime(true);
|
||||
$start = time();
|
||||
//Wait for responses
|
||||
$running = null;
|
||||
$memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
|
||||
$mrc_check = CURLM_CALL_MULTI_PERFORM;
|
||||
restore_error_handler();
|
||||
do {
|
||||
$mrc = @curl_multi_exec($agent_handler, $running);
|
||||
/* 0.05 is to prevent this from being too busy a loop sucking
|
||||
up CPU cycle. We check every 0.05 if another page is ready of
|
||||
not*/
|
||||
if ($mrc != CURLM_CALL_MULTI_PERFORM) {
|
||||
$mrc_check = CURLM_OK;
|
||||
$ready = curl_multi_select($agent_handler, 0.05);
|
||||
}
|
||||
} while (memory_get_usage() < $memory_limit && $mrc == $mrc_check &&
|
||||
time() - $start < C\PAGE_TIMEOUT && $running > 0);
|
||||
set_error_handler(C\NS_LIB . "yioop_error_handler");
|
||||
if (time() - $start > C\PAGE_TIMEOUT && $timer) {
|
||||
crawlLog(" TIMED OUT!!!");
|
||||
}
|
||||
if ($timer) {
|
||||
crawlLog(" Page Request time ".(changeInMicrotime($start_time)));
|
||||
}
|
||||
$start_time = microtime(true);
|
||||
//Process returned pages
|
||||
for ($i = 0; $i < $num_sites; $i++) {
|
||||
if ($timer) {
|
||||
crawlTimeoutLog("FetchUrl initial processing of page %s of %s",
|
||||
$i, $num_sites);
|
||||
}
|
||||
if (!$minimal && isset($ip_holder[$i]) ) {
|
||||
rewind($ip_holder[$i]);
|
||||
$header = fread($ip_holder[$i], 8192);
|
||||
$ip_addresses = self::getCurlIp($header);
|
||||
fclose($ip_holder[$i]);
|
||||
}
|
||||
$is_gopher = false;
|
||||
if (!empty($sites[$i][0])) {
|
||||
// Get Data and Message Code
|
||||
$content = @curl_multi_getcontent($sites[$i][0]);
|
||||
$is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
|
||||
/*
|
||||
If the Transfer-encoding was chunked then the Range header
|
||||
we sent was ignored. So we manually truncate the data
|
||||
here
|
||||
*/
|
||||
if ($page_range_request > 0) {
|
||||
$init_len = strlen($content);
|
||||
$content = substr($content, 0, $page_range_request);
|
||||
if (strlen($content) != $init_len) {
|
||||
$sites[$i][CrawlConstants::CONTENT_SIZE] = $init_len;
|
||||
}
|
||||
}
|
||||
if (isset($content) && !$minimal && !$is_gopher) {
|
||||
$site = self::parseHeaderPage($content, $value);
|
||||
$sites[$i] = array_merge($sites[$i], $site);
|
||||
if (isset($header)) {
|
||||
$header = substr($header, 0,
|
||||
strpos($header, "\x0D\x0A\x0D\x0A") + 4);
|
||||
} else {
|
||||
$header = "";
|
||||
}
|
||||
$sites[$i][CrawlConstants::HEADER] =
|
||||
$header . $sites[$i][CrawlConstants::HEADER];
|
||||
unset($header);
|
||||
} else if (isset($content) && !$minimal && $is_gopher) {
|
||||
$sites[$i][CrawlConstants::HEADER] =
|
||||
$header;
|
||||
$sites[$i][$value] = $content;
|
||||
unset($header);
|
||||
} else {
|
||||
$sites[$i][$value] = $content;
|
||||
}
|
||||
if (!$minimal) {
|
||||
$sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
|
||||
CURLINFO_SIZE_DOWNLOAD);
|
||||
$sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
|
||||
CURLINFO_NAMELOOKUP_TIME);
|
||||
$sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
|
||||
CURLINFO_TOTAL_TIME);
|
||||
$sites[$i][self::HTTP_CODE] =
|
||||
curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
|
||||
if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
|
||||
$sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
|
||||
} else if ($is_gopher) {
|
||||
$sites[$i][self::HTTP_CODE] = 200;
|
||||
}
|
||||
if ($ip_addresses) {
|
||||
$sites[$i][self::IP_ADDRESSES] = $ip_addresses;
|
||||
} else {
|
||||
$sites[$i][self::IP_ADDRESSES] = ["0.0.0.0"];
|
||||
}
|
||||
//Get Time, Mime type and Character encoding
|
||||
$sites[$i][self::TIMESTAMP] = time();
|
||||
if ($is_gopher) {
|
||||
$path = UrlParser::getPath($sites[$i][self::URL]);
|
||||
$filename =
|
||||
UrlParser::getDocumentFilename(
|
||||
$sites[$i][self::URL]);
|
||||
if (isset($path[1])) {
|
||||
$gopher_type = $path[1];
|
||||
} else {
|
||||
$gopher_type = 1;
|
||||
}
|
||||
if ($gopher_type == 1) {
|
||||
$sites[$i][self::TYPE] = "text/gopher";
|
||||
} else if (in_array($gopher_type,
|
||||
[0, 3, 6])) {
|
||||
$sites[$i][self::TYPE] = "text/plain";
|
||||
if ($gopher_type == 6) {
|
||||
$sites[$i][$value] = convert_uudecode(
|
||||
$content);
|
||||
}
|
||||
} else if ($gopher_type == 'h') {
|
||||
$sites[$i][self::TYPE] = "text/html";
|
||||
} else if ($gopher_type == 'g') {
|
||||
$sites[$i][self::TYPE] = "image/gif";
|
||||
}
|
||||
$path_info = pathinfo($filename);
|
||||
if (!isset($sites[$i][self::TYPE]) &&
|
||||
isset($path_info['extension'])) {
|
||||
$sites[$i][self::TYPE] =
|
||||
UrlParser::guessMimeTypeFromFileName($filename);
|
||||
} else if (!isset($sites[$i][self::TYPE])) {
|
||||
$sites[$i][self::TYPE] = "unknown";
|
||||
}
|
||||
} else {
|
||||
$type_parts =
|
||||
explode(";", curl_getinfo($sites[$i][0],
|
||||
CURLINFO_CONTENT_TYPE));
|
||||
$sites[$i][self::TYPE] =
|
||||
strtolower(trim($type_parts[0]));
|
||||
}
|
||||
}
|
||||
/*
|
||||
Ideally should have line for all requests
|
||||
However, this seems to cause curl to sometimes crash
|
||||
by trying to free stuff twice on some linux systems
|
||||
at crawl time. Not having it on other systems causes crashes
|
||||
at query time
|
||||
*/
|
||||
if ($minimal || !stristr(PHP_OS, "LINUX")) {
|
||||
curl_multi_remove_handle($agent_handler, $sites[$i][0]);
|
||||
}
|
||||
curl_close($sites[$i][0]);
|
||||
if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
|
||||
if (isset($sites[$i][self::TYPE]) &&
|
||||
$sites[$i][self::TYPE] != "text/plain" &&
|
||||
isset($sites[$i][CrawlConstants::LOCATION]) &&
|
||||
count($site[CrawlConstants::LOCATION]) > 0) {
|
||||
$sites[$i][self::TYPE] = "text/plain";
|
||||
$sites[$i][self::HTTP_CODE] = "200";
|
||||
$tmp = wordwrap($sites[$i][$value], 80);
|
||||
$tmp_parts = explode("\n", $tmp);
|
||||
$tmp = "# Suspect server misconfiguration\n";
|
||||
$tmp .= "# Assume shouldn't crawl this site.\n";
|
||||
$tmp .= "# Pretending got following robots.txt.\n";
|
||||
$tmp .= "User-agent: *\n";
|
||||
$tmp .= "Disallow: /\n";
|
||||
$tmp .= "# Original error code: ".
|
||||
$sites[$i][self::HTTP_CODE]."\n";
|
||||
$tmp .= "# Original content:\n";
|
||||
foreach ($tmp_parts as $part) {
|
||||
$tmp = "#".$part."\n";
|
||||
}
|
||||
$sites[$i][$value] = $tmp;
|
||||
$sites[$i][self::HTTP_CODE] = "200";
|
||||
unset($site[CrawlConstants::LOCATION]);
|
||||
}
|
||||
}
|
||||
} //end big if
|
||||
} //end for
|
||||
if ($timer) {
|
||||
crawlLog(" Get Page Content time ".
|
||||
(changeInMicrotime($start_time)));
|
||||
}
|
||||
if (microtime(true) - $handler_time > C\PAGE_TIMEOUT) {
|
||||
if (!empty($agent_handler)) {
|
||||
curl_multi_close($agent_handler);
|
||||
}
|
||||
$agent_handler = null;
|
||||
}
|
||||
return $sites;
|
||||
}
|
||||
/**
|
||||
* Curl requests are typically done using a cached IP address which is
* stored after ### at the end of urls if this is possible. To make this
* work, the host in the url is replaced by this IP address and an http
* Host: header with the original host is added to the headers for the
* curl request. The job of this function is to do this replacement
|
||||
* @param string $url site to download with ip address at end potentially
|
||||
* after ###
|
||||
* @param bool $minimal don't try to do replacement, but do add an Expect
|
||||
* header
|
||||
* @param array $proxy_servers if not empty an array of proxy servers
|
||||
* used to crawl through
|
||||
* @return array 3-tuple (orig url, url with replacement, http header array)
|
||||
*/
|
||||
public static function prepareUrlHeaders($url, $minimal = false,
    $proxy_servers = [])
{
    /* decode html-encoded ampersands so the query string is sent as
       intended
     */
    $url = str_replace("&amp;", "&", $url);
    $is_gopher = (substr($url, 0, 6) == "gopher");
    /* Check if an ETag was added by the queue server. If found, create
       an If-None-Match header with the ETag and add it to the headers.
       Remove the ETag from the URL
     */
    $if_none_match = "If-None-Match";
    $etag = null;
    if (C\USE_ETAG_EXPIRES && stristr($url, "ETag:")) {
        $etag_parts = preg_split("/ETag\:/i", $url);
        $etag_data = explode(" ", $etag_parts[1]);
        if (isset($etag_data[1])) {
            $etag = $etag_data[1];
        }
        /* the marker was detected case-insensitively above, so it must
           be located case-insensitively too; otherwise a false position
           would make substr_replace cut from the start of the url
         */
        $pos = strripos($url, "ETag:");
        if ($pos !== false) {
            $url = substr_replace($url, "", $pos,
                strlen("ETag: " . $etag));
        }
    }
    /* in queue_server we added the ip (if available)
       after the url followed by ###
     */
    $headers = [];
    /* stays null unless the host could be swapped for an ip address;
       in every other case the (possibly trimmed) url itself is used
     */
    $url_with_ip_if_possible = null;
    if (!$minimal) {
        $url_ip_parts = explode("###", $url);
        if ($proxy_servers != [] || (isset($url_ip_parts[0]) &&
            (stripos($url_ip_parts[0], '.onion') !== false))) {
            // going through a proxy or to an onion url: drop the ip part
            $url_ip_parts = [$url_ip_parts[0]];
            $url = $url_ip_parts[0];
        }
        if (count($url_ip_parts) > 1) {
            $ip_address = ltrim(urldecode(array_pop($url_ip_parts)), "#");
            // inet_pton warns on an invalid address, we only want a length
            $packed_ip = @inet_pton($ip_address);
            $len = ($packed_ip === false) ? 0 : strlen($packed_ip);
            if ($len == 4 || $len == 16) {
                if ($len == 16) {
                    // IPv6 literals must be bracketed inside a url
                    $ip_address = "[$ip_address]";
                }
                if (count($url_ip_parts) > 1) {
                    $url = implode("###", $url_ip_parts);
                } else {
                    $url = $url_ip_parts[0];
                }
                $url_parts = @parse_url($url);
                if (isset($url_parts['host'])) {
                    $cnt = 1;
                    $replaced_url = str_replace($url_parts['host'],
                        $ip_address, $url, $cnt);
                    /* only trust the replacement if the host occurred
                       exactly once in the url
                     */
                    if ($cnt == 1) {
                        $url_with_ip_if_possible = $replaced_url;
                        $headers[] = "Host:" . $url_parts['host'];
                    }
                }
            }
        }
    }
    if ($url_with_ip_if_possible === null) {
        $url_with_ip_if_possible = $url;
    }
    $headers[] = 'Expect:';
    if (C\USE_ETAG_EXPIRES && $etag !== null) {
        $headers[] = $if_none_match . ": " . $etag;
    }
    if ($is_gopher) {
        // for gopher urls a marker string is returned instead of headers
        $headers = "gopher";
    }
    return [$url, $url_with_ip_if_possible, $headers];
}
|
||||
/**
|
||||
* Computes a hash of a string containing page data for use in
|
||||
* deduplication of pages with similar content
|
||||
*
|
||||
* @param string& $page reference to web page data
|
||||
* @return string 8 byte hash to identify page contents
|
||||
*/
|
||||
public static function computePageHash(&$page)
{
    /* For dedup purposes script, noscript, and style elements are removed
       together with their contents; after that non-word characters and
       tags are stripped and what is left gets hashed
     */
    $element_patterns = [
        '@<script[^>]*?>.*?</script>@si',
        '@<noscript[^>]*?>.*?</noscript>@si',
        '@<style[^>]*?>.*?</style>@si'];
    $word_chars_only = preg_replace('/\W+/', '',
        preg_replace($element_patterns, '', $page));
    $text = strip_tags($word_chars_only);
    if ($text == "") {
        // nothing survived tag stripping, fall back to the prior string
        $text = $word_chars_only;
    }
    return crawlHash(preg_replace('/\W+/', '', $text), true);
}
|
||||
/**
|
||||
* Splits an http response document into the http headers sent
|
||||
* and the web page returned. Parses out useful information from
|
||||
* the header and return an array of these two parts and the useful info.
|
||||
*
|
||||
* @param string $header_and_page string of downloaded data
|
||||
* @param string $value field to store the page portion of page
|
||||
* @return array info array consisting of a header, page for an http
|
||||
* response, as well as parsed from the header the server, server
|
||||
* version, operating system, encoding, and date information.
|
||||
*/
|
||||
public static function parseHeaderPage($header_and_page,
    $value=CrawlConstants::PAGE)
{
    // -1 marks "no such validator seen in the headers"
    $cache_page_validators = [];
    $cache_page_validators['etag'] = -1;
    $cache_page_validators['expires'] = -1;
    $new_offset = 0;
    // header will include all redirect headers
    $site = [];
    $site[CrawlConstants::LOCATION] = [];
    /* Each pass looks for the blank line ending the header block that
       starts at $new_offset. If that block has a Location: or Refresh:
       line its target is recorded and another pass is made, so the
       header blocks of a chain of redirects are all consumed.
     */
    do {
        $continue = false;
        $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
            $new_offset);
        $LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
        //either two CRLF (what spec says) or two LF's to be safe
        $old_offset = $new_offset;
        $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
        // neither separator found: strpos gave false, use offset 0
        $header_offset = ($header_offset) ? $header_offset : 0;
        // skip over the separator itself (4 bytes for CRLFCRLF, else 2)
        $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
            : $header_offset + 2;
        $redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
        $redirect_str = "Location:";
        if ($redirect_pos === false) {
            $redirect_pos =
                stripos($header_and_page, 'Refresh:', $old_offset);
            $redirect_str = "Refresh:";
        }
        /* a printable character directly before the match means it is
           the tail of a longer header name rather than a header itself
         */
        if (isset($header_and_page[$redirect_pos - 1]) &&
            ord($header_and_page[$redirect_pos - 1]) > 32) {
            $redirect_pos = $new_offset; //ignore X-XRDS-Location header
        } else if ($redirect_pos !== false && $redirect_pos < $new_offset){
            // match lies inside the current header block: take rest of line
            $redirect_pos += strlen($redirect_str);
            $pre_line = substr($header_and_page, $redirect_pos,
                strpos($header_and_page, "\n", $redirect_pos) -
                $redirect_pos);
            $loc = @trim($pre_line);
            if (strlen($loc) > 0) {
                $site[CrawlConstants::LOCATION][] = @$loc;
            }
            $continue = true;
        }
    } while($continue);
    // split the data at the last header/body boundary that was found
    if ($header_offset > 0) {
        $site[CrawlConstants::HEADER] =
            substr($header_and_page, 0, $header_offset);
        $site[$value] = ltrim(substr($header_and_page, $header_offset));
    } else { //header message no body; maybe 301?
        $site[CrawlConstants::HEADER] = $header_and_page;
        $site[$value] = " ";
    }
    $lines = explode("\n", $site[CrawlConstants::HEADER]);
    // the second whitespace separated token of the first line is the code
    $first_line = array_shift($lines);
    $response = preg_split("/(\s+)/", $first_line);
    $site[CrawlConstants::HTTP_CODE] = isset($response[1]) ?
        @trim($response[1]) : 0;
    $site[CrawlConstants::ROBOT_METAS] = [];
    /* The remaining lines are scanned for the headers of interest. The
       checks are substring tests, so one line can satisfy several of
       them (for example Content-type: and charset=)
     */
    foreach ($lines as $line) {
        $line = trim($line);
        if (stristr($line, 'Server:')) {
            // split as name/version (operating system)
            $server_parts = preg_split("/Server\:/i", $line);
            $server_name_parts = @explode("/", $server_parts[1]);
            $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
            if (isset($server_name_parts[1])) {
                $version_parts = explode("(", $server_name_parts[1]);
                $site[CrawlConstants::SERVER_VERSION] =
                    @trim($version_parts[0]);
                if (isset($version_parts[1])) {
                    $os_parts = explode(")", $version_parts[1]);
                    $site[CrawlConstants::OPERATING_SYSTEM] =
                        @trim($os_parts[0]);
                }
            }
        }
        if (stristr($line, 'Content-type:')) {
            // mime type is what sits between the first ':' and any ';'
            list(,$mimetype,) = preg_split("/:|;/i", $line);
            $site[CrawlConstants::TYPE] = trim($mimetype);
        }
        if (stristr($line, 'charset=')) {
            $line_parts = preg_split("/charset\=/i", $line);
            $site[CrawlConstants::ENCODING] =
                strtoupper(@trim($line_parts[1]));
        }
        if (stristr($line, 'Last-Modified:')) {
            $line_parts = preg_split("/Last\-Modified\:/i", $line);
            $site[CrawlConstants::MODIFIED] =
                strtotime(@trim($line_parts[1]));
        }
        if (stristr($line, 'X-Robots-Tag:')) { // robot directives pdfs etc
            $line_parts = preg_split("/X\-Robots\-Tag\:/i", $line);
            $robot_metas = explode(",", $line_parts[1]);
            foreach ($robot_metas as $robot_meta) {
                $site[CrawlConstants::ROBOT_METAS][] = strtoupper(
                    trim($robot_meta));
            }
        }
        if (stristr($line, 'Content-Range:')) {
            // the number after the '/' is stored as the content size
            $line_parts = explode("/", $line);
            if (!empty($line_parts[1])) {
                $content_size = intval(trim($line_parts[1]));
                if ($content_size > 0) {
                    $site[CrawlConstants::CONTENT_SIZE] = $content_size;
                }
            }
        }
        $canonical_regex = "/Link\:\s*\<\s*(http.*)\s*\>\s*\;\s*".
            "rel\s*\=\s*(\"|')?canonical(\"|')?/";
        // levenshtein gives notices on strings longer than 255
        /* NOTE(review): $site[CrawlConstants::URL] is not assigned in
           this method before this test -- confirm whether this branch
           is reachable
         */
        if (preg_match($canonical_regex, $line, $matches) &&
            isset($site[CrawlConstants::URL]) && strlen($matches[1]) < 252
            && (strlen($site[CrawlConstants::URL]) >= 255 ||
            levenshtein($matches[1], $site[CrawlConstants::URL]) > 3)) {
            // for rel canonical headers
            $site[CrawlConstants::LOCATION][] = $matches[1];
            $site[CrawlConstants::ROBOT_METAS][] = 'NOFOLLOW';
        }
        if (C\USE_ETAG_EXPIRES && stristr($line, 'ETag:')) {
            $line_parts = preg_split("/ETag\:/i", $line);
            if (isset($line_parts[1])) {
                // value is the token after the space following "ETag:"
                $etag_data = explode(" ", $line_parts[1]);
                if (isset($etag_data[1])) {
                    $etag = $etag_data[1];
                    $cache_page_validators['etag'] = $etag;
                }
            }
        }
        if (C\USE_ETAG_EXPIRES && stristr($line, 'Expires:')) {
            $line_parts = preg_split("/Expires\:/i", $line);
            $all_dates = $line_parts[1];
            /* splitting on commas puts the text after the first comma
               of each date into the odd positions of $date_parts
             */
            $date_parts = explode(",", $all_dates);
            if (count($date_parts) == 2) {
                $cache_page_validators['expires'] = strtotime(
                    $date_parts[1]);
            } else if (count($date_parts) > 2) {
                /*Encountered some pages with more than one Expires date
                  :O */
                $timestamps = [];
                for ($i = 1;$i < count($date_parts); $i += 2) {
                    $ds = strtotime($date_parts[$i]);
                    $timestamps[] = $ds;
                }
                // use the earliest of the dates
                $lowest = min($timestamps);
                $cache_page_validators['expires'] = $lowest;
            }
        }
        // only store validators once at least one has been seen
        if (C\USE_ETAG_EXPIRES && !($cache_page_validators['etag'] == -1 &&
            $cache_page_validators['expires'] == -1)) {
            $site[CrawlConstants::CACHE_PAGE_VALIDATORS] =
                $cache_page_validators;
        }
    }
    /*
       If the doc is HTML and it uses a http-equiv to set the encoding
       then we override what the server says (if anything). As we
       are going to convert to UTF-8 we remove the charset info
       from the meta tag so cached pages will display correctly and
       redirects without char encoding won't be given a different hash.
     */
    $encoding_info = guessEncodingHtml($site[$value], true);
    if (is_array($encoding_info)) {
        // array result: encoding, start of charset text, and its length
        list($site[CrawlConstants::ENCODING], $start_charset, $len_c) =
            $encoding_info;
        $site[$value] = substr_replace($site[$value], "", $start_charset,
            $len_c);
    } else {
        $site[CrawlConstants::ENCODING] = $encoding_info;
    }
    if (!isset($site[CrawlConstants::SERVER]) ) {
        $site[CrawlConstants::SERVER] = "unknown";
    }
    return $site;
}
|
||||
/**
|
||||
* Computes the IP addresses from http get-response header
|
||||
*
|
||||
* @param string $header contains complete transcript of HTTP get/response
|
||||
* @return mixed array of the distinct, valid IP address strings found in
*      the transcript, or false if there were none
|
||||
*/
|
||||
public static function getCurlIp($header)
{
    // curl's transcript has a "Trying <address>..." line per attempt
    if (!preg_match_all('/Trying\s+(.*)(\.\.\.)/', $header, $matches)) {
        return false;
    }
    $valid_addresses = [];
    foreach (array_unique($matches[1]) as $candidate) {
        // keep only candidates that parse as an IP address
        if (@inet_pton($candidate) !== false) {
            $valid_addresses[] = $candidate;
        }
    }
    return ($valid_addresses != []) ? $valid_addresses : false;
}
|
||||
/**
|
||||
* Make a curl request for the provided url
|
||||
*
|
||||
* @param string $site url of page to request
|
||||
* @param array $post_data any data to be POST'd to the URL
|
||||
* @param bool $check_for_errors whether or not to check the response
|
||||
* for the words, NOTICE, WARNING, FATAL which might indicate an
|
||||
* error on the server
|
||||
* @param string $user_password username:password to use for connection if
|
||||
* needed (optional)
|
||||
* @return string the contents of what the curl request fetched
|
||||
*/
|
||||
public static function getPage($site, $post_data = null,
    $check_for_errors = false, $user_password = null)
{
    // curl handles are kept between calls, one per host
    static $agents = [];
    $not_web_setting = (php_sapi_name() == 'cli');
    $MAX_SIZE = 50;
    /* parse_url gives null if the url has no host and false if the url
       is seriously malformed. Both are mapped to the key "" so that a
       handle always exists below and curl itself reports the failure
     */
    $host = @parse_url($site, PHP_URL_HOST);
    if (!is_string($host)) {
        $host = "";
    }
    if (count($agents) > $MAX_SIZE) {
        // too many cached handles, drop the oldest
        $agent_host = array_shift($agents);
        if ($agent_host) {
            curl_close($agent_host);
        }
    }
    if (empty($agents[$host])) {
        $agents[$host] = curl_init();
    }
    $agent = $agents[$host];
    if ($not_web_setting) {
        crawlLog("  Init curl request of a single page");
    }
    curl_setopt($agent, CURLOPT_USERAGENT, C\USER_AGENT);
    curl_setopt($agent, CURLOPT_URL, $site);
    curl_setopt($agent, CURLOPT_AUTOREFERER, true);
    curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true);
    // these next two lines should probably be modified for better security
    curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($agent, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($agent, CURLOPT_NOSIGNAL, true);
    curl_setopt($agent, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($agent, CURLOPT_FAILONERROR, true);
    curl_setopt($agent, CURLOPT_TIMEOUT, C\SINGLE_PAGE_TIMEOUT);
    curl_setopt($agent, CURLOPT_CONNECTTIMEOUT, C\PAGE_TIMEOUT);
    //make lighttpd happier
    curl_setopt($agent, CURLOPT_HTTPHEADER, ['Expect:']);
    if ($post_data != null) {
        curl_setopt($agent, CURLOPT_POST, true);
        curl_setopt($agent, CURLOPT_POSTFIELDS, $post_data);
    } else {
        // since we are caching agents, need to do this so doesn't get stuck
        // as post and so query string ignored for get's
        curl_setopt($agent, CURLOPT_HTTPGET, true);
    }
    if ($user_password != null) {
        /* NOTE(review): CURLOPT_USERPWD and CURLOPT_SSLVERSION are never
           cleared on a cached handle, so they presumably stay in effect
           for later requests to the same host made without a password --
           confirm whether that is intended
         */
        curl_setopt($agent, CURLOPT_FAILONERROR, false);
        curl_setopt($agent, CURLOPT_USERPWD, $user_password);
        curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($agent, CURLOPT_SSLVERSION,
            CURL_SSLVERSION_TLSv1_2);
    }
    if ($not_web_setting) {
        crawlLog("  Set curl options for single page request");
    }
    $time = time();
    $response = curl_exec($agent);
    if (time() - $time > C\PAGE_TIMEOUT && $not_web_setting) {
        crawlLog("  Request took longer than page timeout!!");
        crawlLog("  Either could not reach URL or website took too");
        crawlLog("  long to respond.");
    }
    // don't leave this request's post data on the cached handle
    curl_setopt($agent, CURLOPT_POSTFIELDS, "");
    if ($not_web_setting) {
        crawlLog("  Done curl exec");
    }
    if ($not_web_setting && $check_for_errors) {
        self::checkResponseForErrors($response);
    }
    return $response;
}
|
||||
/**
|
||||
* Given the results of a getPage call, check whether or not the response
|
||||
* had the words NOTICE, WARNING, FATAL which might indicate an error on
|
||||
* the server. If it does, then the $response string is sent to the
|
||||
* crawlLog
|
||||
*
|
||||
* @param string $response getPage response in which to check for errors
|
||||
*/
|
||||
public static function checkResponseForErrors($response)
{
    // nothing to report unless one of the tell-tale words is present
    if (!preg_match("/NOTICE|WARNING|FATAL/i", $response)) {
        return;
    }
    crawlLog("There appears to have been an error in the server ".
        "response. Response was:");
    crawlLog(wordwrap($response));
}
|
||||
}
|
156
src/library/FileCache.php
Normal file
156
src/library/FileCache.php
Normal file
|
@ -0,0 +1,156 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\models\datasources as D;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Library of functions used to implement a simple file cache
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class FileCache
{
    /**
     * File used to serve last cache request
     * @var string
     */
    public $cache_file;
    /**
     * Folder name to use for this FileCache
     * @var string
     */
    public $dir_name;
    /**
     * Total number of bins to cycle between
     */
    const NUMBER_OF_BINS = 24;
    /**
     * Maximum number of files in a bin
     */
    const MAX_FILES_IN_A_BIN = 10000;
    /**
     * Creates the directory for the file cache if it does not exist yet
     *
     * @param string $dir_name folder name of where to put the file cache
     */
    public function __construct($dir_name)
    {
        $this->dir_name = $dir_name;
        if (!is_dir($this->dir_name)) {
            mkdir($this->dir_name);
            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
            $db = new $db_class();
            $db->setWorldPermissionsRecursive($this->dir_name, true);
        }
    }
    /**
     * Retrieve data associated with a key that has been put in the cache
     *
     * As a side effect $this->cache_file is set to the path that was
     * checked for $key.
     *
     * @param string $key the key to look up
     * @return mixed the data associated with the key if it exists, false
     *      otherwise
     */
    public function get($key)
    {
        $checksum_block = $this->checksum($key);
        $this->cache_file = $this->dir_name . "/$checksum_block/" .
            webencode($key);
        if (file_exists($this->cache_file)) {
            return unserialize(file_get_contents($this->cache_file));
        }
        return false;
    }
    /**
     * Stores in the cache a key-value pair
     *
     * Only when a key is set is there a check for whether to invalidate
     * a cache bin. Each bin keeps a last_expired.txt file recording when
     * the bin was last emptied and how many items have been added since.
     * A bin is emptied when that count is more than
     * self::MAX_FILES_IN_A_BIN, after which counting starts again from 0.
     *
     * @param string $key to associate with value
     * @param mixed $value to store
     */
    public function set($key, $value)
    {
        $checksum_block = $this->checksum($key);
        $checksum_dir = $this->dir_name."/$checksum_block";
        $data = null;
        if (file_exists("$checksum_dir/last_expired.txt")) {
            $data = unserialize(
                file_get_contents("$checksum_dir/last_expired.txt"));
        }
        // missing or unreadable bookkeeping: start afresh
        if (!is_array($data) ||
            !isset($data['last_expired'], $data['count'])) {
            $data = ['last_expired' => time(), 'count' => 0];
        }
        if ($data['count'] > self::MAX_FILES_IN_A_BIN) {
            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
            $db = new $db_class();
            $db->unlinkRecursive($checksum_dir);
            /* the bin is empty again, so its count has to restart;
               otherwise every later set() would empty the bin anew
             */
            $data = ['last_expired' => time(), 'count' => 0];
        }
        if (!file_exists($checksum_dir)) {
            mkdir($checksum_dir);
            $data['last_expired'] = time(); /* currently count is used rather
                than time, but we store time anyway.
             */
        }
        $cache_file = "$checksum_dir/".webencode($key);
        // overwriting an existing item does not add a file to the bin
        if (!file_exists($cache_file)) {
            $data['count']++;
        }
        file_put_contents("$checksum_dir/last_expired.txt",
            serialize($data));
        file_put_contents($cache_file, serialize($value));
    }
    /**
     * Maps the provided key to the number of the bin it is stored in by
     * summing the byte values of the key modulo self::NUMBER_OF_BINS
     *
     * @param string $key to convert to a bin number
     * @return int value between 0 and self::NUMBER_OF_BINS - 1
     */
    public function checksum($key)
    {
        $len = strlen($key);
        $value = 0;
        for ($i = 0; $i < $len; $i++) {
            $value += ord($key[$i]);
        }
        return ($value % self::NUMBER_OF_BINS);
    }
}
|
294
src/library/HashTable.php
Normal file
294
src/library/HashTable.php
Normal file
|
@ -0,0 +1,294 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
|
||||
*
|
||||
* Code used to manage a memory efficient hash table
|
||||
* Weights for the queue must be floats
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class HashTable extends StringArray
|
||||
{
|
||||
/**
|
||||
* The size in bytes for keys stored in the hash table
|
||||
*
|
||||
* @var int
|
||||
*/
|
||||
public $key_size;
|
||||
/**
|
||||
* The size in bytes of values associated with keys
|
||||
*
|
||||
* @var int
|
||||
*/
|
||||
public $value_size;
|
||||
/**
|
||||
* Holds an all \0 string of length $this->key_size
|
||||
* @var string
|
||||
*/
|
||||
public $null;
|
||||
/**
|
||||
* Holds a \xFF byte followed by $this->key_size - 1 many \0 bytes.
|
||||
* Used to indicate that a slot once held data but that data was deleted.
|
||||
* Such a slot tells a lookup to keep going, but on an insert can be
|
||||
* overwritten if the inserted key is not already in the table
|
||||
* @var string
|
||||
*/
|
||||
public $deleted;
|
||||
/**
|
||||
* Number of items currently in the hash table
|
||||
* @var int
|
||||
*/
|
||||
public $count;
|
||||
/**
|
||||
* Flag for hash table lookup methods
|
||||
*/
|
||||
const ALWAYS_RETURN_PROBE = 1;
|
||||
/**
|
||||
* Flag for hash table lookup methods
|
||||
*/
|
||||
const RETURN_PROBE_ON_KEY_FOUND = 0;
|
||||
/**
|
||||
* Flag for hash table lookup methods
|
||||
*/
|
||||
const RETURN_VALUE = -1;
|
||||
/**
|
||||
* Flag for hash table lookup methods
|
||||
*/
|
||||
const RETURN_BOTH = -2;
|
||||
/**
|
||||
* Makes a persistently stored (i.e., on disk and ram) hash table using the
|
||||
* supplied parameters
|
||||
*
|
||||
* @param string $fname filename to use when storing the hash table to disk
|
||||
* @param int $num_values number of key value pairs the table can hold
|
||||
* @param int $key_size number of bytes to store a hash table key
|
||||
* @param int $value_size number of bytes to store a hash table value
|
||||
* @param int $save_frequency how many non read operation before saving to
|
||||
* disk
|
||||
*/
|
||||
public function __construct($fname, $num_values, $key_size, $value_size,
    $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
{
    // each stored item is a key immediately followed by its value
    $item_size = $key_size + $value_size;
    $this->count = 0;
    $this->key_size = $key_size;
    $this->value_size = $value_size;
    // key of a slot that was never used: $key_size NUL bytes
    $this->null = pack("x" . $key_size);
    // key of a deleted slot: an 0xFF byte, then $key_size - 1 NUL bytes
    $this->deleted = pack("H2x" . ($key_size - 1), "FF");
    parent::__construct($fname, $num_values, $item_size, $save_frequency);
}
|
||||
/**
|
||||
* Inserts the provided $key - $value pair into the hash table
|
||||
*
|
||||
* @param string $key the key to use for the insert (will be needed for
|
||||
* lookup)
|
||||
* @param string $value the value associated with $key
|
||||
* @param int $probe if the location in the hash table is already known
|
||||
* to be $probe then this variable can be used to save a lookup
|
||||
* @return bool whether the insert was successful or not
|
||||
*/
|
||||
public function insert($key, $value, $probe = false)
{
    if ($probe === false) {
        $probe = $this->lookup($key, self::ALWAYS_RETURN_PROBE);
    }
    if ($probe === false) {
        /* this is a little slow
           the idea is we can't use deleted slots until we are sure
           $key isn't in the table
         */
        $probe = $this->lookupArray($key, [$this->null, $this->deleted],
            self::ALWAYS_RETURN_PROBE);
        if ($probe === false) {
            crawlLog("No space in hash table");
            return false;
        }
    }
    /* there was a free slot so write the entry: exactly key_size bytes of
       key followed by exactly value_size bytes of value. Input that is too
       short should not happen, but rather than give an error it is padded
       with NUL bytes ("\0" has to be double quoted, '\0' would pad with a
       backslash and a zero); input that is too long is truncated
     */
    $key_bytes = substr(str_pad($key, $this->key_size, "\0"), 0,
        $this->key_size);
    $value_bytes = substr(str_pad($value, $this->value_size, "\0"), 0,
        $this->value_size);
    $this->put($probe, $key_bytes . $value_bytes);
    $this->count++;
    $this->checkSave();
    return true;
}
|
||||
/**
|
||||
* Tries to lookup the key in the hash table either return the
|
||||
* location where it was found or the value associated with the key.
|
||||
*
|
||||
* @param string $key key to look up in the hash table
|
||||
* @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
|
||||
* self::RETURN_PROBE_ON_KEY_FOUND, self::RETURN_VALUE, or self::RETURN_BOTH.
|
||||
* Here value means the value associated with the key and probe is
|
||||
* either the location in the array where the key was found or
|
||||
* the first location in the array where it was determined the
|
||||
* key could not be found.
|
||||
* @return mixed would be string if the value is being returned,
|
||||
* an int if the probe is being returned, and false if the key
|
||||
* is not found
|
||||
*/
|
||||
public function lookup($key, $return_probe_value = self::RETURN_VALUE)
{
    // For a plain lookup only the table's null key cuts off the probing
    $cut_off_keys = [$this->null];
    return $this->lookupArray($key, $cut_off_keys, $return_probe_value);
}
|
||||
/**
|
||||
* Tries to lookup the key in the hash table either return the
|
||||
* location where it was found or the value associated with the key.
|
||||
* If the key is not at the initial probe value, linear search in the
|
||||
* table is done. The values which cut-off the search are stored in
|
||||
* $null_array. Using an array allows for flexibility since a deleted
|
||||
* entry needs to be handled differently when doing a lookup than when
|
||||
* doing an insert.
|
||||
*
|
||||
* @param string $key key to look up in the hash table
|
||||
* @param array $null_array key values that would cut-off the search
|
||||
* for key if the initial probe failed
|
||||
* @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
|
||||
* self::RETURN_PROBE_ON_KEY_FOUND, self::RETURN_VALUE, or
* self::RETURN_BOTH (a [probe, value] pair). Here
|
||||
* value means the value associated with the key and probe is
|
||||
* either the location in the array where the key was found or
|
||||
* the first location in the array where it was determined the
|
||||
* key could not be found.
|
||||
* @return mixed would be string if the value is being returned,
|
||||
* an int if the probe is being returned, and false if the key
|
||||
* is not found
|
||||
*/
|
||||
public function lookupArray($key, $null_array,
    $return_probe_value = self::RETURN_VALUE)
{
    $num_values = $this->num_values;
    $start = $this->hash($key);
    $found = false;
    $probe = false;
    $index_value = false;
    // Linear probing: begin at the hashed slot and wrap around the table
    for ($j = 0; $j < $num_values; $j++) {
        $probe = ($start + $j) % $num_values;
        list($index_key, $index_value) = $this->getEntry($probe);
        /* Keys are raw binary strings, so compare them strictly; a loose
           comparison would treat numeric-looking strings such as "1e1"
           and "10" as equal and cut the search off too early.
         */
        if (in_array($index_key, $null_array, true)) {
            // a cut-off slot was reached, so $key is not in the table
            return ($return_probe_value == self::ALWAYS_RETURN_PROBE) ?
                $probe : false;
        }
        /* === 0 rather than == 0: on older PHP versions strcmp returns
           null for bad arguments and null == 0 would count as a match
         */
        if (strcmp($key, $index_key) === 0) {
            $found = true;
            break;
        }
    }
    if (!$found) {
        // every slot was examined (or the table is empty) without a match
        return false;
    }
    if ($return_probe_value == self::RETURN_BOTH) {
        return [$probe, $index_value];
    }
    $probe_modes = [self::RETURN_PROBE_ON_KEY_FOUND,
        self::ALWAYS_RETURN_PROBE];
    if (in_array($return_probe_value, $probe_modes)) {
        return $probe;
    }
    return $index_value;
}
|
||||
/**
|
||||
* Deletes the data associated with the provided key from the hash table
|
||||
*
|
||||
* @param string $key the key to delete the entry for
|
||||
* @param int $probe if the location in the hash table is already known
|
||||
* to be $probe then this variable can be used to save a lookup
|
||||
* @return bool whether or not something was deleted
|
||||
*/
|
||||
public function delete($key, $probe = false)
{
    // Only search for the key when the caller did not supply its location
    if ($probe === false) {
        $probe = $this->lookup($key, self::RETURN_PROBE_ON_KEY_FOUND);
        if ($probe === false) {
            return false;
        }
    }
    // Deleted marker: a single 0xFF byte followed by null bytes
    $entry_size = $this->key_size + $this->value_size;
    $deleted_entry = pack("H2x" . ($entry_size - 1), "FF");
    $this->put($probe, $deleted_entry);
    $this->count--;
    $this->checkSave();
    return true;
}
|
||||
/**
|
||||
* Get the ith entry of the array for the hash table (no hashing here)
|
||||
*
|
||||
* @param int $i an index of the hash table array
|
||||
* @return array the key value pair stored at this index
|
||||
*/
|
||||
public function getEntry($i)
{
    // Each stored record is the key bytes followed by the value bytes
    $record = $this->get($i);
    return [
        substr($record, 0, $this->key_size),
        substr($record, $this->key_size, $this->value_size)
    ];
}
|
||||
/**
|
||||
* Hashes the provided key to an index in the array of the hash table
|
||||
*
|
||||
* @param string $key a key to be hashed into the hash table
|
||||
* @return int an index in the array of the hash table
|
||||
*/
|
||||
public function hash($key)
{
    $digest = md5($key, true);
    // Build a 24 bit number out of the first three bytes of the md5 digest
    $first_24_bits = (ord($digest[0]) << 16) + (ord($digest[1]) << 8) +
        ord($digest[2]);
    // Scale from the range [0, 2^24) to the range [0, num_values)
    return floor($first_24_bits * $this->num_values / (1 << 24));
}
|
||||
/**
|
||||
* Pretty prints the contents of the hash table viewed as an array.
|
||||
*
|
||||
*/
|
||||
public function printContents()
{
    /* Slots are numbered 0 .. num_values - 1 (lookupArray probes with
       ($index + $j) % num_values), so start at 0 and stop before
       num_values. The previous 1 .. num_values loop skipped the first
       slot and read one entry past the end of the table.
     */
    for ($i = 0; $i < $this->num_values; $i++) {
        list($key, $value) = $this->getEntry($i);
        print "Entry: $i Key:" . $key . " Value: " . $value . "\n";
    }
}
|
||||
}
|
473
src/library/IndexArchiveBundle.php
Normal file
473
src/library/IndexArchiveBundle.php
Normal file
|
@ -0,0 +1,473 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/**
|
||||
* Used for crawlLog and crawlHash
|
||||
*/
|
||||
require_once __DIR__.'/Utility.php';
|
||||
/**
|
||||
* Encapsulates a set of web page summaries and an inverted word-index of terms
|
||||
* from these summaries which allow one to search for summaries containing a
|
||||
* particular word.
|
||||
*
|
||||
* The basic file structures for an IndexArchiveBundle are:
|
||||
* <ol>
|
||||
* <li>A WebArchiveBundle for web page summaries.</li>
|
||||
* <li>A IndexDictionary containing all the words stored in the bundle.
|
||||
* Each word entry in the dictionary contains starting and ending
|
||||
* offsets for documents containing that word for some particular IndexShard
|
||||
* generation.</li>
|
||||
* <li>A set of index shard generations. These generations
|
||||
* have names index0, index1,... A shard has word entries, word doc entries
|
||||
* and document entries. For more information see the index shard
|
||||
* documentation.
|
||||
* </li>
|
||||
* <li>
|
||||
* The file generation.txt keeps track of what is the current generation.
|
||||
* A given generation can hold NUM_WORDS_PER_GENERATION words amongst all
|
||||
* its partitions. After which the next generation begins.
|
||||
* </li>
|
||||
* </ol>
|
||||
*
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class IndexArchiveBundle implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Folder name to use for this IndexArchiveBundle
|
||||
* @var string
|
||||
*/
|
||||
public $dir_name;
|
||||
/**
|
||||
* A short text name for this IndexArchiveBundle
|
||||
* @var string
|
||||
*/
|
||||
public $description;
|
||||
/**
|
||||
* Number of partitions in the summaries WebArchiveBundle
|
||||
* @var int
|
||||
*/
|
||||
public $num_partitions_summaries;
|
||||
/**
|
||||
* structure contains info about the current generation:
|
||||
* its index (ACTIVE), and the number of words it contains
|
||||
* (NUM_WORDS).
|
||||
* @var array
|
||||
*/
|
||||
public $generation_info;
|
||||
/**
|
||||
* Number of docs before a new generation is started
|
||||
* @var int
|
||||
*/
|
||||
public $num_docs_per_generation;
|
||||
/**
|
||||
* WebArchiveBundle for web page summaries
|
||||
* @var object
|
||||
*/
|
||||
public $summaries;
|
||||
/**
|
||||
* IndexDictionary for all shards in the IndexArchiveBundle
|
||||
* This contains entries of the form (word, num_shards with word,
|
||||
* posting list info 0th shard containing the word,
|
||||
* posting list info 1st shard containing the word, ...)
|
||||
* @var object
|
||||
*/
|
||||
public $dictionary;
|
||||
/**
|
||||
* Index Shard for current generation inverted word index
|
||||
* @var object
|
||||
*/
|
||||
public $current_shard;
|
||||
/**
|
||||
* What version of index archive bundle this is
|
||||
* @var int
|
||||
*/
|
||||
public $version;
|
||||
/**
|
||||
* Threshold beyond which we don't load old index shard when
|
||||
* restarting and instead just advance to a new shard
|
||||
*/
|
||||
const NO_LOAD_SIZE = 50000000;
|
||||
/**
|
||||
* Makes or initializes an IndexArchiveBundle with the provided parameters
|
||||
*
|
||||
* @param string $dir_name folder name to store this bundle
|
||||
* @param bool $read_only_archive whether to open archive only for reading
|
||||
* or reading and writing
|
||||
* @param string $description a text name/serialized info about this
|
||||
* IndexArchiveBundle
|
||||
* @param int $num_docs_per_generation the number of pages to be stored
|
||||
* in a single shard
|
||||
*/
|
||||
public function __construct($dir_name, $read_only_archive = true,
    $description = null, $num_docs_per_generation =
    C\NUM_DOCS_PER_GENERATION)
{
    $this->dir_name = $dir_name;
    if (!is_dir($this->dir_name)) {
        if ($read_only_archive) {
            /* Nothing to open. A constructor cannot hand a value back to
               the caller of new (the earlier return false was silently
               discarded), so just stop here; the summaries and dictionary
               properties are left unset.
             */
            return;
        }
        // writable bundle that does not exist yet: create its folders
        mkdir($this->dir_name);
        mkdir($this->dir_name . "/posting_doc_shards");
    }
    $generation_file = $this->dir_name . "/generation.txt";
    if (file_exists($generation_file)) {
        $this->generation_info = unserialize(
            file_get_contents($generation_file));
    } else if (!$read_only_archive) {
        // brand new bundle: start at generation 0 and record that on disk
        $this->generation_info['ACTIVE'] = 0;
        file_put_contents($generation_file,
            serialize($this->generation_info));
    }
    $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
        $read_only_archive, -1, $description);
    if (!$read_only_archive) {
        $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
    }
    // description and version are taken from the summaries bundle
    $this->description = $this->summaries->description;
    if (isset($this->summaries->version)) {
        $this->version = $this->summaries->version;
    }
    $this->num_docs_per_generation = $num_docs_per_generation;
    $this->dictionary = new IndexDictionary($this->dir_name . "/dictionary",
        $this);
}
|
||||
/**
|
||||
* Add the array of $pages to the summaries WebArchiveBundle pages being
|
||||
* stored in the partition $generation and the field used
|
||||
* to store the resulting offsets given by $offset_field.
|
||||
*
|
||||
* @param int $generation field used to select partition
|
||||
* @param string $offset_field field used to record offsets after storing
|
||||
* @param array& $pages data to store
|
||||
* @param int $visited_urls_count number to add to the count of visited urls
|
||||
* (visited urls is a smaller number than the total count of objects
|
||||
* stored in the index).
|
||||
*/
|
||||
public function addPages($generation, $offset_field, &$pages,
    $visited_urls_count)
{
    $summaries = $this->summaries;
    // choose the partition to write to before storing the pages
    $summaries->setWritePartition($generation);
    $summaries->addPages($offset_field, $pages);
    $summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
}
|
||||
/**
|
||||
* Adds the provided mini inverted index data to the IndexArchiveBundle
|
||||
* Expects initGenerationToAdd to be called before, so generation is correct
|
||||
*
|
||||
* @param object $index_shard a mini inverted index of word_key=>doc data
|
||||
* to add to this IndexArchiveBundle
|
||||
*/
|
||||
public function addIndexData($index_shard)
{
    crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
    $append_start = microtime(true);
    $active_shard = $this->getActiveShard();
    $active_shard->appendIndexShard($index_shard);
    // log memory use and elapsed time of the append for diagnostics
    $memory_used = memory_get_usage();
    $elapsed = changeInMicrotime($append_start);
    crawlLog("Append Index Shard: Memory usage:" . $memory_used .
        " Time: " . $elapsed);
}
|
||||
/**
|
||||
* Determines based on its size, if index_shard should be added to
|
||||
* the active generation or if a new generation should be started.
|
||||
* If so, a new generation is started, the old generation is saved, and
|
||||
* the dictionary of the old shard is copied to the bundles dictionary
|
||||
* and a log-merge performed if needed
|
||||
*
|
||||
* @param int $add_num_docs number of docs in the shard about to be added
|
||||
* @param object $callback object with join function to be
|
||||
* called if process is taking too long
|
||||
* @param bool $blocking whether there is an ongoing merge tiers operation
|
||||
* occurring, if so don't do anything and return -1
|
||||
* @return int the active generation after the check and possible change has
|
||||
* been performed
|
||||
*/
|
||||
public function initGenerationToAdd($add_num_docs, $callback = null,
    $blocking = false)
{
    $current_num_docs = $this->getActiveShard()->num_docs;
    crawlLog("Current index shard has ".$current_num_docs." documents.");
    $memory_limit = metricToInt(ini_get("memory_limit"));
    crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
        memory_get_usage());
    $shard_would_overflow = ($current_num_docs + $add_num_docs >
        $this->num_docs_per_generation);
    /* A memory_limit of -1 means "no limit". Without the > 0 guard,
       0.55 * -1 is always below the current usage, so a new shard would
       be started on every call.
     */
    $memory_nearly_full = ($memory_limit > 0 &&
        (0.55 * $memory_limit) < memory_get_usage());
    if ($shard_would_overflow || $memory_nearly_full) {
        if ($blocking == true) {
            // caller asked us not to switch shards right now
            return -1;
        }
        crawlLog("Switching Index Shard...");
        $switch_time = microtime(true);
        // Save current shard dictionary to main dictionary
        $this->forceSave();
        $this->addAdvanceGeneration($callback);
        crawlLog("Switch Index Shard time:".
            changeInMicrotime($switch_time));
    }
    return $this->generation_info['ACTIVE'];
}
|
||||
/**
|
||||
* Starts a new generation, the dictionary of the old shard is copied to
|
||||
* the bundles dictionary and a log-merge performed if needed. This
|
||||
* function may be called by initGenerationToAdd as well as when resuming
|
||||
* a crawl rather than loading the periodic index save of a too large
|
||||
* shard.
|
||||
*
|
||||
* @param object $callback object with join function to be
|
||||
* called if process is taking too long
|
||||
*/
|
||||
public function addAdvanceGeneration($callback = null)
{
    // first fold the outgoing shard's words into the bundle dictionary
    $this->addCurrentShardDictionary($callback);
    //Set up new shard
    $this->generation_info['ACTIVE']++;
    $new_generation = $this->generation_info['ACTIVE'];
    $this->generation_info['CURRENT'] = $new_generation;
    $shard_path = $this->dir_name . "/posting_doc_shards/index" .
        $new_generation;
    $this->current_shard = new IndexShard($shard_path, $new_generation,
        $this->num_docs_per_generation);
    // persist the generation bookkeeping
    file_put_contents($this->dir_name . "/generation.txt",
        serialize($this->generation_info));
}
|
||||
/**
|
||||
* Adds the words from this shard to the dictionary
|
||||
* @param object $callback object with join function to be
|
||||
* called if process is taking too long
|
||||
*/
|
||||
public function addCurrentShardDictionary($callback = null)
{
    $active_generation = $this->generation_info['ACTIVE'];
    $shard_path = $this->dir_name . "/posting_doc_shards/index" .
        $active_generation;
    /* want to do the copying of dictionary as files to conserve memory
       in case merge tiers after adding to dictionary
     */
    $this->current_shard = new IndexShard($shard_path, $active_generation,
        $this->num_docs_per_generation, true);
    $this->dictionary->addShardDictionary($this->current_shard, $callback);
}
|
||||
/**
|
||||
* Sets the current shard to be the active shard (the active shard is
|
||||
* what we call the last (highest indexed) shard in the bundle. Then
|
||||
* returns a reference to this shard
|
||||
* @return object last shard in the bundle
|
||||
*/
|
||||
public function getActiveShard()
{
    // setCurrentShard returns true when it actually changed the shard
    if ($this->setCurrentShard($this->generation_info['ACTIVE'])) {
        return $this->getCurrentShard();
    }
    if (isset($this->current_shard)) {
        return $this->current_shard;
    }
    // no shard object held yet: make one for the CURRENT generation
    $generation = $this->generation_info['CURRENT'];
    $shard_path = $this->dir_name . "/posting_doc_shards/index" .
        $generation;
    $this->current_shard = new IndexShard($shard_path, $generation,
        $this->num_docs_per_generation);
    return $this->current_shard;
}
|
||||
/**
|
||||
* Returns the shard which is currently being used to read word-document
|
||||
* data from the bundle. If one wants to write data to the bundle use
|
||||
* getActiveShard() instead. The point of this method is to allow
|
||||
* for lazy reading of the file associated with the shard.
|
||||
*
|
||||
* @param bool $force_read whether to force no advance generation and
|
||||
* merge dictionary side effects
|
||||
* @return object the index shard currently being read
|
||||
*/
|
||||
public function getCurrentShard($force_read = false)
{
    if (isset($this->current_shard)) {
        return $this->current_shard;
    }
    if (!isset($this->generation_info['CURRENT'])) {
        $this->generation_info['CURRENT'] =
            $this->generation_info['ACTIVE'];
    }
    $generation = $this->generation_info['CURRENT'];
    $shard_path = $this->dir_name . "/posting_doc_shards/index" .
        $generation;
    if (!file_exists($shard_path)) {
        // nothing saved for this generation yet, so begin with a new shard
        $this->current_shard = new IndexShard($shard_path, $generation,
            $this->num_docs_per_generation);
    } else if (!empty($this->generation_info['DISK_BASED'])) {
        // disk based: only the header is read; the rest stays on disk
        $this->current_shard = new IndexShard($shard_path, $generation,
            $this->num_docs_per_generation, true);
        $this->current_shard->getShardHeader();
        $this->current_shard->read_only_from_disk = true;
    } else if (!$force_read &&
        filesize($shard_path) > self::NO_LOAD_SIZE) {
        // saved shard is too big to load; advance to a new generation
        $this->addAdvanceGeneration();
    } else {
        $this->current_shard = IndexShard::load($shard_path);
    }
    return $this->current_shard;
}
|
||||
/**
|
||||
* Sets the current shard to be the $i th shard in the index bundle.
|
||||
*
|
||||
* @param $i which shard to set the current shard to be
|
||||
* @param $disk_based whether to read the whole shard in before using or
|
||||
* leave it on disk except for pages needed and use memcache
|
||||
*/
|
||||
public function setCurrentShard($i, $disk_based = false)
{
    // the disk based flag is recorded even if the shard is not changed
    $this->generation_info['DISK_BASED'] = $disk_based;
    $generations_known = isset($this->generation_info['CURRENT'],
        $this->generation_info['ACTIVE']);
    if ($generations_known) {
        $already_current = ($i == $this->generation_info['CURRENT']);
        $beyond_active = ($i > $this->generation_info['ACTIVE']);
        if ($already_current || $beyond_active) {
            return false;
        }
    }
    $this->generation_info['CURRENT'] = $i;
    // drop the held shard so that getCurrentShard creates it afresh
    unset($this->current_shard);
    return true;
}
|
||||
/**
|
||||
* Gets the page out of the summaries WebArchiveBundle with the given
|
||||
* offset and generation
|
||||
*
|
||||
* @param int $offset byte offset in partition of desired page
|
||||
* @param int $generation which generation WebArchive to look up in
|
||||
* defaults to the same number as the current shard
|
||||
* @return array desired page
|
||||
*/
|
||||
public function getPage($offset, $generation = -1)
{
    // -1 means: use the generation of the current shard
    $partition = ($generation == -1) ?
        $this->generation_info['CURRENT'] : $generation;
    return $this->summaries->getPage($offset, $partition);
}
|
||||
/**
|
||||
* Forces the current shard to be saved
|
||||
*/
|
||||
public function forceSave()
{
    $active_shard = $this->getActiveShard();
    $active_shard->save(false, true);
}
|
||||
/**
|
||||
* Computes the number of occurrences of each of the supplied list of
|
||||
* word_keys
|
||||
*
|
||||
* @param array $word_keys keys to compute counts for
|
||||
* @return array associative array of key => count values.
|
||||
*/
|
||||
public function countWordKeys($word_keys)
{
    if (!is_array($word_keys) || count($word_keys) < 1) {
        return null;
    }
    $counts = [];
    foreach ($word_keys as $word_key) {
        $occurrences = 0;
        $word_info = $this->dictionary->getWordInfo($word_key);
        // a false result leaves the count for this key at 0
        if ($word_info !== false) {
            foreach ($word_info as $entry) {
                // component 3 of each entry is added up as the count
                $occurrences += $entry[3];
            }
        }
        $counts[$word_key] = $occurrences;
    }
    return $counts;
}
|
||||
/**
|
||||
* Gets the description, count of summaries, and number of partitions of the
|
||||
* summaries store in the supplied directory. If the file
|
||||
* arc_description.txt exists, this is viewed as a dummy index archive for
|
||||
* the sole purpose of allowing conversions of downloaded data such as arc
|
||||
* files into Yioop! format.
|
||||
*
|
||||
* @param string $dir_name path to a directory containing a summaries
|
||||
* WebArchiveBundle
|
||||
* @return array summary of the given archive
|
||||
*/
|
||||
public static function getArchiveInfo($dir_name)
{
    $arc_description_file = $dir_name . "/arc_description.txt";
    if (file_exists($arc_description_file)) {
        // dummy archive: build the info from the arc description alone
        $crawl = [
            'DESCRIPTION' => substr(
                file_get_contents($arc_description_file), 0, 256),
            'ARCFILE' => true
        ];
        return [
            'VISITED_URLS_COUNT' => 0,
            'COUNT' => 0,
            'NUM_DOCS_PER_PARTITION' => 0,
            'WRITE_PARTITION' => 0,
            'DESCRIPTION' => serialize($crawl)
        ];
    }
    if (file_exists($dir_name . "/description.txt")) {
        $info = WebArchiveBundle::getArchiveInfo($dir_name);
        if (isset($info['DESCRIPTION'])) {
            return $info;
        }
    }
    // otherwise use the summaries bundle inside the folder
    return WebArchiveBundle::getArchiveInfo($dir_name . "/summaries");
}
|
||||
/**
|
||||
* Sets the archive info (DESCRIPTION, COUNT,
|
||||
* NUM_DOCS_PER_PARTITION) for the web archive bundle associated with
|
||||
* this bundle. As DESCRIPTION is used to store info about the info
|
||||
* bundle this sets the global properties of the info bundle as well.
|
||||
*
|
||||
* @param string $dir_name folder with archive bundle
|
||||
* @param array $info struct with above fields
|
||||
*/
|
||||
public static function setArchiveInfo($dir_name, $info)
{
    // the info is stored with the summaries bundle inside the folder
    $summaries_dir = $dir_name . "/summaries";
    WebArchiveBundle::setArchiveInfo($summaries_dir, $info);
}
|
||||
/**
|
||||
* Returns the last time the archive info of the bundle was modified.
|
||||
*
|
||||
* @param string $dir_name folder with archive bundle
|
||||
*/
|
||||
public static function getParamModifiedTime($dir_name)
{
    // delegate to the summaries bundle inside the folder
    $summaries_dir = $dir_name . "/summaries";
    return WebArchiveBundle::getParamModifiedTime($summaries_dir);
}
|
||||
}
|
1305
src/library/IndexDictionary.php
Normal file
1305
src/library/IndexDictionary.php
Normal file
File diff suppressed because it is too large
Load diff
309
src/library/IndexManager.php
Normal file
309
src/library/IndexManager.php
Normal file
|
@ -0,0 +1,309 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/**
|
||||
* For crawlHash
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
/**
|
||||
* Class used to manage open IndexArchiveBundle's while performing
|
||||
* a query. Ensures an easy place to obtain references to these bundles
|
||||
* and ensures only one object per bundle is instantiated in a Singleton-esque
|
||||
* way.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class IndexManager implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Open IndexArchiveBundle's managed by this manager
|
||||
* @var array
|
||||
*/
|
||||
public static $indexes = [];
|
||||
/**
|
||||
* Used to cache word lookup of posting list locations for a given
|
||||
* index
|
||||
* @var array
|
||||
*/
|
||||
public static $dictionary = [];
|
||||
/**
|
||||
* Returns a reference to the managed copy of an IndexArchiveBundle object
|
||||
* with a given timestamp or an IndexShard in the case where
|
||||
* $index_name == "feed" (for handling media feeds)
|
||||
*
|
||||
* @param string $index_name timestamp of desired IndexArchiveBundle
|
||||
* @return object the desired IndexArchiveBundle reference
|
||||
*/
|
||||
public static function getIndex($index_name)
{
    $index_name = trim($index_name); //trim to fix postgres quirkiness
    if (isset(self::$indexes[$index_name])) {
        // already opened earlier, reuse the managed object
        return self::$indexes[$index_name];
    }
    if ($index_name == "feed") {
        // the feed index is a single disk based IndexShard
        $feed_file = C\WORK_DIRECTORY . "/feeds/index";
        if (!file_exists($feed_file)) {
            return false;
        }
        self::$indexes[$index_name] = new IndexShard(
            $feed_file, 0, C\NUM_DOCS_PER_GENERATION, true);
        return self::$indexes[$index_name];
    }
    $bundle_dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
        $index_name;
    $bundle = new IndexArchiveBundle($bundle_dir);
    if (!$bundle) {
        return false;
    }
    self::$indexes[$index_name] = $bundle;
    self::$indexes[$index_name]->setCurrentShard(0, true);
    return self::$indexes[$index_name];
}
|
||||
/**
|
||||
* Returns the version of the index, so that Yioop can determine
|
||||
* how to do word lookup. The only major change to the format was
|
||||
* when word_id's went from 8 to 20 bytes which happened around Unix
|
||||
* time 1369754208.
|
||||
*
|
||||
* @param string $index_name unix timestamp of index
|
||||
* @return int 0 - if the original format for Yioop indexes; 1 - if 20 byte
|
||||
* word_id format
|
||||
*/
|
||||
public static function getVersion($index_name)
{
    // index names are timestamps; ones before the cut-off are version 0
    if (intval($index_name) < C\VERSION_0_TIMESTAMP) {
        return 0;
    }
    $index = self::getIndex($index_name);
    // fall back to version 1 when the index does not record a version
    return isset($index->version) ? $index->version : 1;
}
|
||||
/**
|
||||
* Gets an array posting list positions for each shard in the
|
||||
* bundle $index_name for the word id $hash
|
||||
*
|
||||
* @param string $index_name bundle to look $hash in
|
||||
* @param string $hash hash of phrase or word to look up in bundle
|
||||
* dictionary
|
||||
* @param int $shift if $hash is for a phrase, how many low order
|
||||
* bits of word id to discard
|
||||
* @param string $mask if $hash is for a word, after the 9th byte what
|
||||
* meta word mask should be applied to the 20 byte hash
|
||||
* @param int $threshold after the number of results exceeds this amount
|
||||
* stop looking for more dictionary entries.
|
||||
* @param int $start_generation
|
||||
* @param int $num_distinct_generations
|
||||
* @param bool $with_remaining_total
|
||||
* @return array sequence of four tuples:
|
||||
* (index_shard generation, posting_list_offset, length, exact id
|
||||
* that match $hash)
|
||||
*/
|
||||
public static function getWordInfo($index_name, $hash, $shift = 0,
    $mask = "", $threshold = -1, $start_generation = -1,
    $num_distinct_generations = -1, $with_remaining_total = false)
{
    // results are cached per (index, start generation, generation count)
    $id = "$index_name:$start_generation:$num_distinct_generations";
    $index = IndexManager::getIndex($index_name);
    if (!$index->dictionary) {
        // no bundle dictionary available: only the feed shard can answer
        $tmp = [];
        if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS)
            && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
            //NO_FEEDS defined true in statistic_controller.php
            $feed_shard = IndexManager::getIndex("feed");
            $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
                $mask);
            if (is_array($feed_info)) {
                $tmp[-1] = [-1, $feed_info[0],
                    $feed_info[1], $feed_info[2], $feed_info[3]];
            }
        }
        if ($tmp == []) {
            return ($with_remaining_total) ? [0, false] : false;
        }
        /* The total is component 3 of the feed record (the same component
           that is added to the total further below), not $feed_info[3],
           which ends up in the record's id slot (component 4).
         */
        IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
            [$tmp[-1][3], $tmp];
        return ($with_remaining_total) ?
            IndexManager::$dictionary[$id][$hash][$shift][$mask][
                $threshold] :
            IndexManager::$dictionary[$id][$hash][$shift][$mask][
                $threshold][1];
    }
    $len = strlen($mask);
    if ($len > 0) {
        /* with a mask, cached results are looked for under the first 8
           bytes of the hash padded with null bytes
         */
        $pre_hash = substr($hash, 0, 8) .
            "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
    } else {
        $pre_hash = $hash;
    }
    if (!isset(IndexManager::$dictionary[$id][$hash][$shift][$mask][
        $threshold])) {
        $tmp = [];
        if (isset(IndexManager::$dictionary[$id][$pre_hash][$shift])) {
            /* Try to derive the answer from a cached result for a less
               restrictive mask by filtering its records
             */
            foreach (IndexManager::$dictionary[$id][$pre_hash][$shift]
                as $test_mask => $data) {
                $mask_len = strlen($test_mask);
                if ($mask_len > $len) {
                    continue;
                }
                $mask_found = true;
                for ($k = 0; $k < $mask_len; $k++) {
                    if (ord($test_mask[$k]) > 0 &&
                        $test_mask[$k] != $mask[$k]) {
                        $mask_found = false;
                        break;
                    }
                }
                if (!$mask_found || !isset($data[$threshold])) {
                    continue;
                }
                list($total, $info) = $data[$threshold];
                $out_info = [];
                foreach ($info as $record) {
                    $rid = $record[4];
                    $add_flag = true;
                    if ($mask != "") {
                        // masked bytes of the record id must match $hash
                        for ($k = 0; $k < $len; $k++) {
                            $loc = 8 + $k;
                            if (ord($mask[$k]) > 0 &&
                                isset($rid[$loc]) &&
                                $rid[$loc] != $hash[$loc]) {
                                $add_flag = false;
                                break;
                            }
                        }
                    }
                    if ($add_flag) {
                        $out_info[$record[0]] = $record;
                    } else if ($record[3] < $total) {
                        // dropped record: take its count out of the total
                        $total -= $record[3];
                    }
                }
                /* Store under [$threshold] like every other cache write
                   in this method. The entry used to be written directly
                   at [$mask], so later lookups either missed it or read
                   the [total, info] pair as if it were keyed by threshold.
                 */
                IndexManager::$dictionary[$id][$hash][$shift][$mask][
                    $threshold] = [$total, $out_info];
                return ($with_remaining_total) ?
                    IndexManager::$dictionary[$id][$hash][$shift][$mask][
                        $threshold] :
                    IndexManager::$dictionary[$id][$hash][$shift][$mask][
                        $threshold][1];
            }
        }
        if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) &&
            $start_generation < 0
            && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
            //NO_FEEDS defined true in statistic_controller.php
            $feed_shard = IndexManager::getIndex("feed");
            $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
                $mask);
            if (is_array($feed_info)) {
                $tmp[-1] = [-1, $feed_info[0],
                    $feed_info[1], $feed_info[2], $feed_info[3]];
            }
        }
        $pre_info =
            $index->dictionary->getWordInfo($hash, true, $shift, $mask,
            $threshold, $start_generation, $num_distinct_generations, true);
        if (isset($pre_info[1])) {
            list($total, $info) = $pre_info;
        } else {
            $total = 0;
            $info = [];
        }
        if (isset($tmp[-1][3])) {
            // put the feed record (generation -1) ahead of the others
            $total += $tmp[-1][3];
            $info = $tmp + $info;
        }
        IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
            [$total, $info];
    }
    return ($with_remaining_total) ?
        IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold]:
        IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold][1];
}
|
||||
/**
 * Counts the documents in an index that a term or phrase appears in
 *
 * @param string $term_or_phrase what to look up in the index's dictionary;
 *      no mask is used for a plain (non-array) hash
 * @param string $index_name index to look up term or phrase in
 * @param int $threshold if set and positive then once more than threshold
 *      many documents are found the search for more documents to add to the
 *      total is stopped
 * @param int $start_generation passed through to getWordInfo
 * @param int $num_distinct_generations passed through to getWordInfo
 * @return mixed int number of documents, or false if the index has no
 *      dictionary
 */
public static function numDocsTerm($term_or_phrase, $index_name,
    $threshold = -1, $start_generation = -1,
    $num_distinct_generations = C\NUM_DISTINCT_GENERATIONS)
{
    $index = IndexManager::getIndex($index_name);
    if (!$index->dictionary) {
        return false;
    }
    $hash_paths = allCrawlHashPaths($term_or_phrase, [], [], true);
    if (!is_array($hash_paths)) {
        $hash_paths = [$hash_paths];
    }
    $doc_count = 0;
    foreach ($hash_paths as $path) {
        // an array entry carries its own shift and mask, a plain hash
        // is looked up with shift 0 and an empty mask
        $has_mask = is_array($path);
        $word_hash = ($has_mask) ? $path[0] : $path;
        $shift = ($has_mask) ? $path[1] : 0;
        $mask = ($has_mask) ? $path[2] : "";
        list($path_count, ) = IndexManager::getWordInfo($index_name,
            $word_hash, $shift, $mask, $threshold, $start_generation,
            $num_distinct_generations, true);
        $doc_count += $path_count;
        if ($threshold > 0 && $doc_count > $threshold) {
            break;
        }
    }
    return $doc_count;
}
|
||||
}
|
1873
src/library/IndexShard.php
Normal file
1873
src/library/IndexShard.php
Normal file
File diff suppressed because it is too large
Load diff
53
src/library/JavascriptUnitTest.php
Normal file
53
src/library/JavascriptUnitTest.php
Normal file
|
@ -0,0 +1,53 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Common parent of the test classes that exercise Javascript functions.
 * Both fixture hooks are implemented here as empty methods.
 *
 * @author Akash Patel
 */
class JavascriptUnitTest extends UnitTest
{
    /**
     * {@inheritDoc}
     *
     * Does nothing in this class.
     */
    public function setUp()
    {
    }

    /**
     * {@inheritDoc}
     *
     * Does nothing in this class.
     */
    public function tearDown()
    {
    }
}
|
||||
|
52
src/library/Join.php
Normal file
52
src/library/Join.php
Normal file
|
@ -0,0 +1,52 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Marker interface used to say that a class supports a join()
 * callback method. IndexArchiveBundle has methods which take objects
 * that implement Join. For activities which may take a long time,
 * such as index saving or index tier merging, IndexArchiveBundle will
 * periodically call the Join object's join method so that the object can
 * continue processing rather than blocking entirely until the long running
 * method completes.
 *
 * @author Chris Pollett
 * @see WebQueueBundle
 */
interface Join
{
    /**
     * Callback invoked periodically by a long running method of
     * another object.
     */
    public function join();
}
|
483
src/library/LocaleFunctions.php
Normal file
483
src/library/LocaleFunctions.php
Normal file
|
@ -0,0 +1,483 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* This file contains global functions connected to localization that
|
||||
* are used throughout the web site part of Yioop!
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\models\LocaleModel;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
 * Attempts to guess the user's locale based on the request, session,
 * and user-agent data
 *
 * The candidate tag is taken, in order of preference, from the request
 * variable l, the session variable l, and the first entry of the browser's
 * HTTP_ACCEPT_LANGUAGE header. A candidate is only accepted if it is a
 * short string made of letters, digits, '-' and '_' and a directory for it
 * exists under LOCALE_DIR; otherwise DEFAULT_LOCALE is returned.
 *
 * @return string IANA language tag of the guessed locale
 */
function guessLocale()
{
    $guess_l = null;
    if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
        $l_parts = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']);
        /* an entry can carry a weight, e.g. "fr;q=0.9"; only the tag
           in front of the ';' is of interest */
        $tag_parts = explode(";", $l_parts[0]);
        $guess_l = trim($tag_parts[0]);
        $guess_map = [
            "cn" => "zh-CN",
            "en" => "en-US",
            "en-us" => "en-US",
            "en-US" => "en-US",
            "fr" => "fr-FR",
            "ko" => "ko",
            "in" => "in-ID",
            "ja" => "ja",
            "vi" => "vi-VN",
            "vi-vn" => "vi-VN",
            "vi-VN" => "vi-VN",
            "zh" => "zh-CN",
            "zh-CN" => "zh-CN",
            "zh-cn" => "zh-CN",
        ];
        if (isset($guess_map[$guess_l])) {
            $guess_l = $guess_map[$guess_l];
        }
    }
    if (isset($_REQUEST['l'])) {
        $l = $_REQUEST['l'];
    } else if (isset($_SESSION['l'])) {
        $l = $_SESSION['l'];
    } else {
        $l = $guess_l;
    }
    $locale_tag = C\DEFAULT_LOCALE;
    /* $l can come straight from the request, so before it is used to build
       a file system path insist it is a string (l[]=x would give an array)
       and that it cannot contain path characters such as '.' or '/'.
       This also rejects the empty string, which would otherwise match
       LOCALE_DIR itself. */
    if (is_string($l) && strlen($l) < 10 &&
        preg_match('/^[A-Za-z0-9_\-]+$/', $l)) {
        if (is_dir(C\LOCALE_DIR . "/" . str_replace("-", "_", $l))) {
            $locale_tag = $l;
        }
    }
    return $locale_tag;
}
|
||||
/**
 * Attempts to guess a locale from a sample of text by scoring which
 * character ranges the characters of the sample fall in
 *
 * @param string $phrase_string used to make guess
 * @param string $locale_tag language tag to use if can't guess -- if not
 *      provided (loosely equal to null) the current locale's value from
 *      getLocaleTag() is used
 * @return string IANA language tag of the guessed locale; when the guess
 *      is en-US the result of checkQuery() on the original string is
 *      returned instead
 */
function guessLocaleFromString($phrase_string, $locale_tag = null)
{
    $unmodified_phrase = $phrase_string;
    if ($locale_tag == null) {
        $locale_tag = getLocaleTag();
    }
    // punctuation, digits and white space carry no language information
    $strip_pattern = '/' . C\PUNCT . "|[0-9]|\s" . '/u';
    $code_units = mb_convert_encoding(
        preg_replace($strip_pattern, "", $phrase_string), "UTF-32", "UTF-8");
    $num_bytes = strlen($code_units);
    $guess = ['zh-CN' => 0, 'ru' => 0, 'he' => 0, 'ar' => 0, 'th' => 0,
        'ja' => 0, 'ko' => 0];
    $guess[$locale_tag] = 1;
    /* each character occupies four bytes; only the third and fourth byte
       are inspected (presumably the two low-order bytes of a big-endian
       UTF-32 code unit -- TODO confirm) */
    for ($offset = 0; $offset < $num_bytes; $offset += 4) {
        $start = ord($code_units[$offset + 2]);
        $next = ord($code_units[$offset + 3]);
        $scored_tag = $locale_tag;
        $weight = 0;
        if ($start >= 78 && $start <= 159) {
            $scored_tag = 'zh-CN';
            $weight = 4;
        } else if ($start == 4 || ($start == 5 && $next < 48)) {
            $scored_tag = 'ru';
            $weight = 1;
        } else if ($start == 5 && $next >= 144) {
            $scored_tag = 'he';
            $weight = 2;
        } else if ($start >= 6 && $start <= 7) {
            // same range is credited to fa when that is the fallback tag
            $scored_tag = ($locale_tag == "fa") ? $locale_tag : 'ar';
            $weight = 2;
        } else if ($start == 14 && $next < 128) {
            $scored_tag = 'th';
            $weight = 2;
        } else if ($start >= 48 && $start <= 49) {
            $scored_tag = 'ja';
            $weight = 3;
        } else if ($start == 17 || ($start >= 172 && $start < 215)) {
            $scored_tag = 'ko';
            $weight = 2;
        } else if ($start == 0 && $next < 128) {
            // assume ascii is from $locale_tag
            $weight = 1;
        }
        if ($weight > 0) {
            $guess[$scored_tag] += $weight;
        }
    }
    $num_points = ($num_bytes / 4) - 1;
    $fallback_score = $guess[$locale_tag];
    if ($num_points >= 0) {
        // first tag (in insertion order) beating both bounds wins
        foreach ($guess as $tag => $score) {
            if ($score >= $num_points && $score > $fallback_score) {
                $locale_tag = $tag;
                break;
            }
        }
    }
    if ($locale_tag == 'en-US') {
        $locale_tag = checkQuery($unmodified_phrase);
    }
    return $locale_tag;
}
|
||||
/**
 * Tries to find whether a query belongs to a programming language by
 * looking for a "java:" or "python:" prefix on the trimmed query
 *
 * @param string $query query entered by user
 *
 * @return string java or py if the corresponding prefix is present,
 *      en-US otherwise
 */
function checkQuery($query)
{
    $prefix_to_lang = ['java:' => 'java', 'python:' => 'py'];
    $found = [];
    preg_match("/^(java:|python:)/", trim($query), $found);
    if (!isset($found[0])) {
        return 'en-US';
    }
    $prefix = $found[0];
    return (isset($prefix_to_lang[$prefix])) ? $prefix_to_lang[$prefix] :
        'en-US';
}
|
||||
/**
 * Tries to guess at a language tag based on the name of a character
 * encoding
 *
 * @param string $encoding a character encoding name
 *
 * @return string ja, zh-CN, ko or ru if the name is one of the encodings
 *      listed for that language, en otherwise
 */
function guessLangEncoding($encoding)
{
    $encodings_by_lang = [
        "ja" => ["EUC-JP", "Shift_JIS", "JIS", "ISO-2022-JP"],
        "zh-CN" => ["EUC-CN", "GBK", "GB2312", "EUC-TW", "HZ", "CP936",
            "BIG-5", "CP950"],
        "ko" => ["EUC-KR", "UHC", "CP949", "ISO-2022-KR"],
        "ru" => ["Windows-1251", "CP1251", "CP866", "IBM866", "KOI8-R"],
    ];
    foreach ($encodings_by_lang as $lang => $names) {
        if (in_array($encoding, $names)) {
            return $lang;
        }
    }
    return 'en';
}
|
||||
/**
 * Tries to guess the encoding used for an Html document
 *
 * If the document has a closing head tag and a charset=... declaration
 * that ends before it, that declaration is used. In every other case the
 * result of mb_detect_encoding() is returned.
 *
 * @param string $html the document to examine
 * @param bool $return_loc_info if a charset declaration was used to
 *      find the encoding, then if $return_loc_info is true, we
 *      return the location of the charset substring. This allows converting
 *      to UTF-8 later so cached pages will display correctly and
 *      redirects without char encoding won't be given a different hash.
 *
 * @return mixed either string or array; if string then guessed encoding,
 *      if array then guessed encoding, start_pos of where charset info came
 *      from, length
 */
function guessEncodingHtml($html, $return_loc_info = false)
{
    $head_end = stripos($html, "</head");
    if ($head_end) {
        $found = preg_match(
            "/charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?/u", $html, $match);
        if (!$found) {
            // second attempt with the multibyte regex functions
            mb_regex_encoding("UTF-8");
            mb_ereg_search_init($html);
            mb_ereg_search("charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?");
            $match = mb_ereg_search_getregs();
            $found = isset($match[0]);
        }
        if ($found && isset($match[6], $match[4])) {
            $charset_len = strlen($match[0]);
            $closing_quote = $match[6];
            /* a quote after the name with no matching quote before it
               is not counted as part of the charset substring */
            if (($closing_quote == "'" || $closing_quote == '"') &&
                $match[3] != $closing_quote) {
                $charset_len--;
            }
            $charset_start = strpos($html, $match[0]);
            if ($charset_start + $charset_len < $head_end) {
                $encoding = strtoupper($match[4]);
                return ($return_loc_info) ?
                    [$encoding, $charset_start, $charset_len] : $encoding;
            }
        }
    }
    return mb_detect_encoding($html, 'auto');
}
|
||||
|
||||
/**
 * Translate the supplied arguments into the current locale.
 * This function takes a variable number of arguments. The first
 * being an identifier to translate. Additional arguments
 * are used to interpolate values in for %s's in the translation.
 *
 * @param string string_identifier identifier to be translated
 * @param mixed additional_args used for interpolation in translated string
 * @return mixed translated string; the identifier itself if the translation
 *      trims to an empty value; false if no locale object has been set
 */
function tl()
{
    $arguments = func_get_args();
    $current_locale = LocaleModel::$current_locale;
    if (is_object($current_locale)) {
        $translated = $current_locale->translate($arguments);
        return (trim($translated)) ? $translated : $arguments[0];
    }
    return false;
}
|
||||
/**
 * Sets the language to be used for locale settings by creating a
 * LocaleModel, initializing it with the given tag, and storing it as
 * LocaleModel::$current_locale
 *
 * @param string $locale_tag the tag of the language to use to determine
 *      locale settings
 */
function setLocaleObject($locale_tag)
{
    // class name is built at run time from the models namespace prefix
    $model_class = C\NS_MODELS . "LocaleModel";
    $new_locale = new $model_class();
    $new_locale->initialize($locale_tag);
    LocaleModel::$current_locale = $new_locale;
}
|
||||
/**
 * Gets the language tag (for instance, en_US for American English) of the
 * locale that is currently being used. If no locale has been set yet, one
 * is guessed with guessLocale() and installed with setLocaleObject(), so
 * this function can have the side effect of setting Yioop's current locale.
 *
 * @return string the tag of the language currently being used for locale
 *      settings
 */
function getLocaleTag()
{
    $current_locale = LocaleModel::$current_locale;
    if ($current_locale) {
        return $current_locale->getLocaleTag();
    }
    $guessed_tag = guessLocale();
    setLocaleObject($guessed_tag);
    return $guessed_tag;
}
|
||||
/**
 * Returns the text direction of the current locale, as reported by the
 * current LocaleModel.
 *
 * @return string ltr or rtl depending on if the language is left-to-right
 *      or right-to-left
 */
function getLocaleDirection()
{
    return LocaleModel::$current_locale->getLocaleDirection();
}
|
||||
/**
 * Returns the query statistics info for the current locale.
 *
 * @return array with field QUERY_LOG holding the query log of the current
 *      locale's db object and field TOTAL_ELAPSED_TIME holding its
 *      total time
 */
function getLocaleQueryStatistics()
{
    $current_locale = LocaleModel::$current_locale;
    return [
        'QUERY_LOG' => $current_locale->db->query_log,
        'TOTAL_ELAPSED_TIME' => $current_locale->db->total_time,
    ];
}
|
||||
/**
 * Returns the current locale's method of writing blocks (things like divs
 * or paragraphs). A language like English puts blocks one after another
 * from the top of the page to the bottom. Other languages like classical
 * Chinese list them from right to left.
 *
 * @return string tb lr rl depending on the current locale's block
 *      progression
 */
function getBlockProgression()
{
    return LocaleModel::$current_locale->getBlockProgression();
}
|
||||
/**
 * Returns the writing mode of the current locale. This is a combination of
 * the locale direction and the block progression. For instance, for English
 * the writing mode is lr-tb (left-to-right top-to-bottom).
 *
 * @return string the locale's writing mode
 */
function getWritingMode()
{
    return LocaleModel::$current_locale->getWritingMode();
}
|
||||
/**
 * Convert the string $str encoded in Windows-1256 into UTF-8
 *
 * Bytes below 0x80 are passed to utf8chr() with their own value as the
 * code point; bytes from 0x80 up are first looked up in a table of code
 * points.
 *
 * @param string $str Windows-1256 string to convert
 * @return string the UTF-8 equivalent
 */
function w1256ToUTF8($str)
{
    // code points for bytes 0x80 - 0xFF, eight per row
    static $upper_half = [
        0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688,
        0x06AF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x06A9, 0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA,
        0x00A0, 0x060C, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
        0x00A8, 0x00A9, 0x06BE, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
        0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x061F,
        0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
        0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
        0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7,
        0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643,
        0x00E0, 0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7,
        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF,
        0x064B, 0x064C, 0x064D, 0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7,
        0x0651, 0x00F9, 0x0652, 0x00FB, 0x00FC, 0x200E, 0x200F, 0x06D2
    ];
    $utf8 = "";
    $num_bytes = strlen($str);
    for ($pos = 0; $pos < $num_bytes; $pos++) {
        $byte = ord($str[$pos]);
        $code_point = ($byte < 0x80) ? $byte : $upper_half[$byte - 0x80];
        $utf8 .= utf8chr($code_point);
    }
    return $utf8;
}
|
||||
/**
 * Given a unicode codepoint convert it to UTF-8
 *
 * @param int $code the codepoint to convert
 * @return string the corresponding UTF-8 string of one to four bytes, or
 *      the empty string if $code is larger than 0x1FFFFF
 */
function utf8chr($code)
{
    if ($code <= 0x7F) {
        return chr($code);
    }
    if ($code <= 0x7FF) {
        // two bytes: 110xxxxx 10xxxxxx
        return chr(($code >> 6) + 192) . chr(($code & 63) + 128);
    }
    if ($code <= 0xFFFF) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(($code >> 12) + 224) . chr((($code >> 6) & 63) + 128) .
            chr(($code & 63) + 128);
    }
    if ($code <= 0x1FFFFF) {
        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr(($code >> 18) + 240) . chr((($code >> 12) & 63) + 128) .
            chr((($code >> 6) & 63) + 128) . chr(($code & 63) + 128);
    }
    return '';
}
|
||||
/**
 * Function for formatting a date string based on the locale.
 *
 * For the locale tags listed in the table below the process locale is
 * switched with setlocale(LC_ALL, ...) (and left that way) and the date is
 * produced by strftime(); for any other tag date() is used.
 *
 * @param int $timestamp is the crawl time
 * @param string $locale_tag is the tag for locale
 * @return string formatted date string
 */
function formatDateByLocale($timestamp, $locale_tag)
{
    /* locale tag => name handed to setlocale(); NOTE(review): these
       three letter names look platform specific -- confirm they are
       recognized on the deployment OS */
    $system_locale_names = [
        'de' => 'deu',
        'en-US' => 'enu',
        'es' => 'esp',
        'fr-FR' => 'fra',
        'it' => 'ita',
        'ja' => 'jpn',
        'ko' => 'kor',
        'pl' => 'plk',
        'ru' => 'rus',
        'tr' => 'trk',
    ];
    foreach ($system_locale_names as $tag => $system_name) {
        if ($locale_tag == $tag) {
            setlocale(LC_ALL, $system_name);
            return strftime("%B %d %Y %H:%M", $timestamp);
        }
    }
    return date("F d Y H:i", intval($timestamp));
}
|
390
src/library/MailServer.php
Normal file
390
src/library/MailServer.php
Normal file
|
@ -0,0 +1,390 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\library\AnalyticsManager;
|
||||
use seekquarry\yioop\library\MediaConstants;
|
||||
|
||||
/**
|
||||
* Timing functions
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
/**
|
||||
* A small class for communicating with an SMTP server. Used to avoid
|
||||
* configuration issues that might be needed with PHP's built-in mail()
|
||||
* function. Here is an example of how one might use this class:
|
||||
*
|
||||
 * $server = new MailServer('someone@somewhere.com', 'somewhere.com', 587, 'someone', 'pword', 'tls');
|
||||
* $to = "cool@place.com";
|
||||
* $from = "someone@somewhere.com";
|
||||
* $subject = "Test Mail";
|
||||
* $message = "This is a test";
|
||||
* $server->send($subject, $from, $to, $message);
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class MailServer implements MediaConstants
|
||||
{
|
||||
/**
|
||||
* Email address of default mail sender
|
||||
* @var string
|
||||
*/
|
||||
public $sender_email;
|
||||
/**
|
||||
* Hostname of default mail sender
|
||||
* @var string
|
||||
*/
|
||||
public $sender_host;
|
||||
/**
|
||||
* Domain name of the SMTP server
|
||||
* @var string
|
||||
*/
|
||||
public $server;
|
||||
/**
|
||||
* Port number the mail server is running on
|
||||
* @var int
|
||||
*/
|
||||
public $port;
|
||||
/**
|
||||
* If auth is used, the username to log into the SMTP server with
|
||||
* @var string
|
||||
*/
|
||||
public $login;
|
||||
/**
|
||||
* If auth is used, the password to log into the SMTP server with
|
||||
* @var string
|
||||
*/
|
||||
public $password;
|
||||
/**
|
||||
* Either false if no security/auth used or ssl or tls
|
||||
* @var mixed
|
||||
*/
|
||||
public $secure;
|
||||
/**
|
||||
* End of line string for an SMTP server
|
||||
*/
|
||||
const EOL = "\r\n";
|
||||
/**
|
||||
* How long before timeout when making a connection to an SMTP server
|
||||
*/
|
||||
const SMTP_TIMEOUT = 10;
|
||||
/**
|
||||
* Length of an SMTP response code
|
||||
*/
|
||||
const SMTP_CODE_LEN = 3;
|
||||
/**
|
||||
* Service ready for requests
|
||||
*/
|
||||
const SERVER_READY = 220;
|
||||
/**
|
||||
* SMTP last action okay
|
||||
*/
|
||||
const OKAY = 250;
|
||||
/**
|
||||
* authentication successful
|
||||
*/
|
||||
const GO_AHEAD = 235;
|
||||
/**
|
||||
* Send next authentication item
|
||||
*/
|
||||
const CONT_REQ = 334;
|
||||
/**
|
||||
* Ready for the actual mail input
|
||||
*/
|
||||
const START_INPUT = 354;
|
||||
/**
 * Encapsulates the domain and credentials of a SMTP server
 * in a MailServer object
 *
 * @param string $sender_email who mail will be sent from (can be
 *      overwritten); the part after the @ is used as the sender host,
 *      with "dev.null" used when there is no @
 * @param string $server domain name of machine will connect to
 * @param int $port port on that machine
 * @param string $login username to use for authentication ("" if no
 *      auth)
 * @param string $password password to use for authentication ("" if no
 *      auth)
 * @param mixed $secure false if SSL and TLS not used, otherwise ssl or tls
 */
public function __construct($sender_email, $server, $port, $login,
    $password, $secure = false)
{
    $this->sender_email = $sender_email;
    $mail_parts = explode("@", $this->sender_email);
    $this->sender_host = (isset($mail_parts[1])) ? $mail_parts[1] :
        "dev.null";
    /* for ssl the transport has to be part of the host name handed to
       fsockopen; previously the prefixed name was computed but never
       stored, so ssl connections were attempted in plain text */
    $this->server = ($secure === "ssl") ? 'ssl://' . $server : $server;
    $this->port = $port;
    $this->login = $login;
    $this->password = $password;
    $this->secure = $secure;
    $this->connection = null;
    $this->messages = "";
}
|
||||
/**
 * Connects to and if needs be authenticates with a SMTP server
 *
 * Steps: open the socket, wait for the server greeting, send HELO, for
 * tls send STARTTLS and switch the socket to encrypted mode followed by a
 * second HELO, and finally, if both a login and password were given,
 * perform AUTH LOGIN. A line describing the failure is appended to
 * $this->messages whenever false is returned.
 *
 * @return bool whether the session was successfully established
 */
public function startSession()
{
    $this->connection = fsockopen($this->server, $this->port, $errno,
        $errstr, self::SMTP_TIMEOUT);
    if (!$this->connection) {
        $this->messages .= "Could not connect to smtp server\n";
        return false;
    }
    if ($this->readResponseGetCode() != self::SERVER_READY) {
        $this->messages .= "SMTP error\n";
        return false;
    }
    $hostname = $this->sender_host;
    $this->smtpCommand("HELO $hostname");
    if ($this->secure == 'tls') {
        if ($this->smtpCommand('STARTTLS') != self::SERVER_READY) {
            $this->messages .= "Cannot start TLS\n";
            return false;
        }
        /* do not carry on (and later send credentials) unless the
           socket really was switched to encrypted mode */
        if (stream_socket_enable_crypto($this->connection, true,
            STREAM_CRYPTO_METHOD_TLS_CLIENT) !== true) {
            $this->messages .= "TLS negotiation failed\n";
            return false;
        }
        if ($this->smtpCommand("HELO $hostname") != self::OKAY) {
            $this->messages .= "TLS HELO error\n";
            return false;
        }
    }
    if ($this->login != "" && $this->password != "") {
        if ($this->smtpCommand('AUTH LOGIN') != self::CONT_REQ) {
            $this->messages .= "Authentication Error Auth Login\n";
            return false;
        }
        if ($this->smtpCommand(base64_encode($this->login))
            != self::CONT_REQ) {
            $this->messages .= "Authentication Error Username Transition\n";
            return false;
        }
        if ($this->smtpCommand(base64_encode($this->password)) !=
            self::GO_AHEAD) {
            $this->messages .= "Authentication Error Password Transition\n";
            return false;
        }
    }
    return true;
}
|
||||
/**
 * Closes the currently active SMTP session by sending QUIT and closing
 * the socket. Does nothing if there is no open connection (for example,
 * because the session was never successfully started).
 */
public function endSession()
{
    if (!is_resource($this->connection)) {
        return;
    }
    $this->smtpCommand('QUIT');
    fclose($this->connection);
}
|
||||
/**
 * Reads data from an SMTP server until the last line of a command
 * response is detected. Everything read is appended to $this->messages.
 *
 * @return string three byte response code taken from the start of the
 *      data read
 */
public function readResponseGetCode()
{
    $data = "";
    while (($line = fgets($this->connection)) !== false) {
        $data .= $line;
        /* only a '-' right after the code announces a further line; the
           final line may have a space there or nothing but the line
           ending, and a line too short to have that position at all
           cannot be a continuation either */
        if (!isset($line[self::SMTP_CODE_LEN]) ||
            $line[self::SMTP_CODE_LEN] != '-') {
            break;
        }
    }
    $this->messages .= $data;
    return substr($data, 0, self::SMTP_CODE_LEN);
}
|
||||
/**
 * Transmits one command line to the connected SMTP server and reports
 * the response code the server answered with
 *
 * The command, passed through htmlentities(), is also appended to the
 * session log kept in $this->messages.
 *
 * @param string $command the command to execute
 * @return string response code as produced by readResponseGetCode()
 */
public function smtpCommand($command)
{
    $log_entry = htmlentities($command) . "\n";
    $this->messages .= $log_entry;
    $wire_line = $command . self::EOL;
    fwrite($this->connection, $wire_line);
    $response_code = $this->readResponseGetCode();
    return $response_code;
}
|
||||
/**
 * Delivers an email, either right away or by handing it to the media
 * updater's mail queue, depending on C\SEND_MAIL_MEDIA_UPDATER
 *
 * Works much like PHP's mail command but does not need an smtp server
 * configured on the current machine. When C\QUERY_STATISTICS is on, the
 * session log and the time spent are recorded with the AnalyticsManager.
 *
 * @param string $subject subject line of the email
 * @param string $from sender email address; when empty the configured
 *      sender_email of this object is used
 * @param string $to recipient email address
 * @param string $message message body for the email
 */
public function send($subject, $from, $to, $message)
{
    $began_at = microtime(true);
    $sender = ($from == "") ? $this->sender_email : $from;
    if (C\SEND_MAIL_MEDIA_UPDATER == "true") {
        $this->sendQueue($subject, $sender, $to, $message);
    } else {
        $this->sendImmediate($subject, $sender, $to, $message);
    }
    if (!C\QUERY_STATISTICS) {
        return;
    }
    // fall back to empty statistics the first time through
    $mail_log = AnalyticsManager::get("MAIL_MESSAGES") ?: [];
    $time_so_far = AnalyticsManager::get("MAIL_TOTAL_TIME") ?: 0;
    $time_taken = changeInMicrotime($began_at);
    $wrapped_log = wordwrap($this->messages, 60, "\n", true);
    $mail_log[] = [
        "QUERY" => "<p>Send Mail</p>" . "<pre>" . $wrapped_log . "</pre>",
        "ELAPSED_TIME" => $time_taken
    ];
    AnalyticsManager::set("MAIL_MESSAGES", $mail_log);
    AnalyticsManager::set("MAIL_TOTAL_TIME", $time_so_far + $time_taken);
}
|
||||
/**
 * Sends immediately an email (as opposed to queueing for a future process
 * to send)
 *
 * When C\USE_MAIL_PHP is set the message is handed to PHP's mail().
 * Otherwise an SMTP session is opened and MAIL FROM, RCPT TO and DATA are
 * issued; the session log is collected in $this->messages.
 *
 * Carriage returns and line feeds are removed from $subject, $from and
 * $to so these values cannot add header lines or SMTP commands. On the
 * SMTP path the line endings of the body are normalized to self::EOL and
 * body lines starting with "." get an extra leading "." (dot-stuffing),
 * so message text cannot end the DATA section prematurely.
 *
 * @param string $subject subject line of the email
 * @param string $from sender email address
 * @param string $to recipient email address
 * @param string $message message body for the email
 */
public function sendImmediate($subject, $from, $to, $message)
{
    $eol = self::EOL;
    $line_breaks = ["\r", "\n"];
    $subject = str_replace($line_breaks, "", $subject);
    $from = str_replace($line_breaks, "", $from);
    $to = str_replace($line_breaks, "", $to);
    if (C\USE_MAIL_PHP) {
        $header = "From: " . $from . $eol;
        mail($to, $subject, $message, $header);
        return;
    }
    $this->messages = "";
    /* Work on "\n" separated text while dot-stuffing so that ^ reliably
       matches at every line start, then switch to the wire line ending. */
    $body = str_replace(["\r\n", "\r"], "\n", $message);
    $body = preg_replace('/^\./m', '..', $body);
    $body = str_replace("\n", $eol, $body);
    $mail = "Date: " . date(DATE_RFC822) . $eol;
    $mail .= "Subject: " . $subject . $eol;
    $mail .= "From: " . $from . $eol;
    $mail .= "To: ". $to . $eol;
    // a line holding a single "." marks the end of the DATA section
    $mail .= $eol . $eol . $body . $eol . ".";
    // each command is paired with the response code that signals success
    $commands = [
        "MAIL FROM: <$from>" => self::OKAY,
        "RCPT TO: <$to>" => self::OKAY,
        "DATA" => self::START_INPUT,
        $mail => self::OKAY
    ];
    if ($this->startSession()) {
        foreach ($commands as $command => $good_response) {
            $response = $this->smtpCommand($command);
            if ($response != $good_response) {
                $this->messages .=
                    "$command failed!! $response $good_response\n";
                break;
            }
        }
        $this->endSession();
    }
}
|
||||
|
||||
/**
 * Sends an email to the media updater mail queue
 *
 * The message is serialized and appended to a batch file named
 * <unix timestamp>.txt in the mail folder of the work directory. A new
 * batch file is started when none exists or when the newest one is older
 * than C\MAX_MAIL_TIMESTAMP_LIMIT seconds. The write is done under a
 * non-blocking exclusive lock; if the lock cannot be obtained the message
 * is not written and this is only logged.
 *
 * @param string $subject subject line of the email
 * @param string $from sender email address
 * @param string $to recipient email address
 * @param string $message message body for the email
 */
public function sendQueue($subject, $from, $to, $message)
{
    $mail_directory = C\WORK_DIRECTORY . self::MAIL_FOLDER;
    if (!file_exists($mail_directory)) {
        mkdir($mail_directory);
        setWorldPermissions($mail_directory);
        if (!file_exists($mail_directory)) {
            crawlLog("Could not create mail directory!");
            exit();
        }
    }
    $files = glob($mail_directory . "/*.txt");
    if ($files === false) {
        // glob() reports errors with false rather than an empty list
        $files = [];
    }
    $file_count = count($files);
    $last_file_time = 0;
    $diff = 0;
    if ($file_count > 0) {
        // batch files are named by creation time, so the last is newest
        $file = end($files);
        $last_file_time = basename($file, ".txt");
        $diff = time() - (int)$last_file_time;
    }
    $mail_details = serialize([$subject, $from, $to, $message]);
    $this->messages = "Queuing: " . $mail_details;
    if ($file_count == 0 || $diff > C\MAX_MAIL_TIMESTAMP_LIMIT) {
        crawlLog("...Creating a new file for next mailer batch!\n");
        $file_time = time();
        $record = self::MESSAGE_SEPARATOR . $mail_details;
        $locked_message = "....Lock for mail file acquired!" .
            " Sending emails!\n";
        $lock_failed_message = "Could not acquire the lock " .
            " for $file_time.txt!\n";
    } else {
        $file_time = $last_file_time;
        /* NOTE(review): unlike the first record of a batch, appended
           records are written without MESSAGE_SEPARATOR. Kept as is;
           confirm against the code that reads these files. */
        $record = $mail_details;
        $locked_message = "....Lock acquired! Sending emails now!\n";
        $lock_failed_message = "Could not acquire the lock! for $file!\n";
    }
    $file_path = $mail_directory . "/" . $file_time . ".txt";
    $fp = fopen($file_path, "a+");
    if ($fp === false) {
        crawlLog("Could not open mail file $file_path!\n");
        return;
    }
    if (flock($fp, LOCK_EX | LOCK_NB)) {
        crawlLog($locked_message);
        fwrite($fp, $record . PHP_EOL);
        // make sure the record is on disk before the lock is given up
        fflush($fp);
        flock($fp, LOCK_UN);
        setWorldPermissions($file_path);
    } else {
        crawlLog($lock_failed_message);
    }
    fclose($fp);
    return;
}
|
||||
}
|
77
src/library/MediaConstants.php
Normal file
77
src/library/MediaConstants.php
Normal file
|
@ -0,0 +1,77 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Constants shared by the components that take part in media related
 * operations (video conversion and batched mail sending)
 *
 * @author Chris Pollett
 */
interface MediaConstants
{
    /**
     * Folder in which video files waiting to be converted are placed
     */
    const CONVERT_FOLDER = '/schedules/media_convert';
    /**
     * Folder in which video files are placed once they have been converted
     */
    const CONVERTED_FOLDER = '/schedules/media_converted';
    /**
     * Name of the text file whose presence marks a video file as about to
     * be split
     */
    const SPLIT_FILE = '/split.txt';
    /**
     * Name of the text file holding the info of a video file
     */
    const FILE_INFO = '/file_info.txt';
    /**
     * Name of the text file holding the number of split files generated
     * from a video file
     */
    const COUNT_FILE = '/count.txt';
    /**
     * Name of the text file listing the split file names that are to be
     * concatenated
     */
    const ASSEMBLE_FILE = '/ready_to_assemble.txt';
    /**
     * Folder for the text files (mailer lists) that are sent in batches
     */
    const MAIL_FOLDER = '/schedules/mail';
    /**
     * Magic string written between mail messages to separate them
     */
    const MESSAGE_SEPARATOR = '+-7b6Ze3ef#a';
}
|
341
src/library/NWordGrams.php
Normal file
341
src/library/NWordGrams.php
Normal file
|
@ -0,0 +1,341 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Library of functions used to create and extract n word grams
|
||||
*
|
||||
* @author Ravi Dhillon (Bigram Version), Chris Pollett (ngrams + rewrite +
|
||||
* support for page count dumps)
|
||||
*/
|
||||
class NWordGrams
|
||||
{
|
||||
/**
|
||||
* Static copy of n-grams files
|
||||
* @var object
|
||||
*/
|
||||
protected static $ngrams = null;
|
||||
/**
|
||||
* How many bytes to read in one go from wiki file when creating filter
|
||||
*/
|
||||
const BLOCK_SIZE = 8192;
|
||||
/**
|
||||
* Suffix appended to language tag to create the
|
||||
* filter file name containing bigrams.
|
||||
*/
|
||||
const FILTER_SUFFIX = "_word_grams.ftr";
|
||||
/**
|
||||
* Suffix appended to language tag to create the
|
||||
* text file name containing bigrams.
|
||||
*/
|
||||
const TEXT_SUFFIX = "_word_grams.txt";
|
||||
const WIKI_DUMP_REDIRECT = 0;
|
||||
const WIKI_DUMP_TITLE = 1;
|
||||
const PAGE_COUNT_WIKIPEDIA = 2;
|
||||
const PAGE_COUNT_WIKTIONARY = 3;
|
||||
/**
 * Says whether or not phrase exists in the N word gram Bloom Filter
 *
 * The filter is read from
 * LOCALE_DIR/$lang/resources/{$filter_prefix}_word_grams.ftr the first
 * time it is needed and then kept in the static $ngrams cache. The cache
 * is keyed by locale as well as by prefix, so filters of different
 * locales are never confused with one another.
 *
 * @param string $phrase what to check if is a bigram
 * @param string $lang language of bigrams file
 * @param string $filter_prefix either the word "segment", "all", or
 *     number n of the number of words in an ngram in filter.
 * @return bool true if the lower-cased phrase is in the filter; false if
 *     it is not or if no filter file exists for the locale
 */
public static function ngramsContains($phrase, $lang, $filter_prefix = 2)
{
    $lang = str_replace("-", "_", $lang);
    if (!isset(self::$ngrams[$lang][$filter_prefix])) {
        $filter_path = C\LOCALE_DIR . "/$lang/resources/" .
            "{$filter_prefix}" . self::FILTER_SUFFIX;
        if (!file_exists($filter_path)) {
            return false;
        }
        self::$ngrams[$lang][$filter_prefix] =
            BloomFilterFile::load($filter_path);
    }
    return self::$ngrams[$lang][$filter_prefix]->contains(
        mb_strtolower($phrase));
}
|
||||
/**
 * Creates a bloom filter file from a n word gram text file. The
 * path of n word gram text file used is based on the input $lang.
 * The name of output filter file is based on the $lang and the
 * number n. Size is based on input number of n word grams.
 * The n word grams are read from the text file one per line, passed
 * through PhraseParser::stemTerms() for $lang, lower-cased and then
 * stored in the filter file. Blank lines, lines that stem to nothing and
 * grams whose first word is a single byte long are skipped.
 *
 * The text file is opened before any existing filter is deleted, so a
 * missing input does not destroy a previously built filter.
 *
 * @param string $lang locale to be used to stem n grams.
 * @param string $num_gram value of n in n-gram (how many words in sequence
 *      should constitute a gram)
 * @param int $num_ngrams_found count of n word grams in text file.
 * @param int $max_gram_len value n of longest n gram to be added.
 * @return none
 */
public static function makeNWordGramsFilterFile($lang, $num_gram,
    $num_ngrams_found, $max_gram_len = 2)
{
    $lang = str_replace("-", "_", $lang);
    $resource_prefix = C\LOCALE_DIR . "/$lang/resources/" . "{$num_gram}";
    $filter_path = $resource_prefix . self::FILTER_SUFFIX;
    $input_file_path = $resource_prefix . self::TEXT_SUFFIX;
    $fp = fopen($input_file_path, 'r') or
        die("Can't open ngrams text file");
    if (file_exists($filter_path)) {
        unlink($filter_path); //build again from scratch
    }
    $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found);
    while (($ngram = fgets($fp)) !== false) {
        $ngram = trim($ngram);
        if ($ngram === "") {
            continue;
        }
        $words = PhraseParser::stemTerms($ngram, $lang);
        if (!isset($words[0])) {
            continue; // nothing left after stemming
        }
        if (strlen($words[0]) == 1) { // get rid of n grams like "a dog"
            continue;
        }
        $ngram_stemmed = implode(" ", $words);
        $ngrams->add(mb_strtolower($ngram_stemmed));
    }
    fclose($fp);
    $ngrams->max_gram_len = $max_gram_len;
    $ngrams->save();
}
|
||||
/**
 * Used to create a filter file suitable for use in word segmentation
 * (splitting text like "thiscontainsnospaces" into
 * "this contains no spaces"). Used by @see token_tool.php
 *
 * For every dictionary word the lower-cased word itself is added to the
 * filter, followed by "*" plus each proper suffix of the word (for "dog":
 * "dog", "*og", "*g"). Blank dictionary lines are ignored.
 *
 * @param string $dict_file file to use as a dictionary to make filter from
 * @param string $lang locale tag of locale we are building the filter for
 */
public static function makeSegmentFilterFile($dict_file, $lang)
{
    $lang = str_replace("-", "_", $lang);
    $filter_path = C\LOCALE_DIR . "/$lang/resources/" .
        "segment" . self::FILTER_SUFFIX;
    $words = file($dict_file);
    if ($words === false) {
        die("Can't open dictionary file");
    }
    if (file_exists($filter_path)) {
        unlink($filter_path); //build again from scratch
    }
    $filter = new BloomFilterFile($filter_path, count($words));
    foreach ($words as $word) {
        $tmp = trim($word);
        if ($tmp === "") {
            continue;
        }
        // measure and slice with the same encoding
        $len = mb_strlen($tmp, "UTF-8");
        $filter->add(mb_strtolower($tmp));
        for ($i = 1; $i < $len; $i++) {
            $suffix = mb_substr($tmp, $i, $len, "UTF-8");
            if ($suffix === "") {
                continue;
            }
            $filter->add(mb_strtolower("*" . $suffix));
        }
    }
    $filter->save();
}
|
||||
/**
 * Generates a n word grams text file from input wikipedia xml file or
 * page count dump.
 * The input can be bz2 compressed, gz compressed or uncompressed, and is
 * chosen by file extension. The input is read in blocks of
 * self::BLOCK_SIZE bytes and processed line by line: each complete line
 * is lower-cased and matched against the pattern for the requested kind
 * of n word gram. After all input is parsed the duplicate n word grams are
 * removed, the list is optionally cut to $max_terms, sorted, and written
 * one gram per line to
 * LOCALE_DIR/$locale/resources/{$num_gram}_word_grams.txt
 *
 * For page count dumps, grams are ranked by the count found on their line
 * before $max_terms is applied. When $num_gram is "all", grams of any
 * length of at least two words are kept and, for grams of three or more
 * words, the proper prefixes of two or more words are added with a
 * trailing "*".
 *
 * @param string $wiki_file compressed or uncompressed wikipedia
 *      XML file path to be used to extract bigrams. This can also
 *      be a folder containing such files. The path is taken relative to
 *      C\PREP_DIR
 * @param string $lang Language to be used to create n grams.
 * @param string $locale Locale to be used to store results.
 * @param int $num_gram number of words in grams we are looking for
 * @param int $ngram_type where in Wiki Dump to extract grams from
 * @param int $max_terms maximum number of n-grams to compute and put in
 *      file
 * @return array pair [number of n grams written to the text file,
 *      maximum number of words in a gram (-1 if not determined)]
 */
public static function makeNWordGramsTextFile($wiki_file, $lang,
    $locale, $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA,
    $max_terms = -1)
{
    $output_message_threshold = self::BLOCK_SIZE * self::BLOCK_SIZE;
    $is_count_type = false;
    $replace_array = [];
    switch ($ngram_type) {
        case self::WIKI_DUMP_TITLE:
            $pattern = '/<title>[^\p{P}]+';
            $pattern_end = '<\/title>/u';
            $replace_array = ['<title>','</title>'];
            break;
        case self::WIKI_DUMP_REDIRECT:
            $pattern = '/#redirect\s\[\[[^\p{P}]+';
            $pattern_end='\]\]/u';
            $replace_array = ['#redirect [[',']]'];
            break;
        case self::PAGE_COUNT_WIKIPEDIA:
            $pattern = '/^'.$lang.'\s[^\p{P}]+';
            $pattern_end='/u';
            $is_count_type = true;
            break;
        case self::PAGE_COUNT_WIKTIONARY:
            $pattern = '/^'.$lang.'.d\s[^\p{P}]+';
            $pattern_end='/u';
            $is_count_type = true;
            break;
        default:
            throw new \InvalidArgumentException(
                "Unknown ngram type: $ngram_type");
    }
    $is_all = false;
    $repeat_pattern = "[\s|_][^\p{P}]+";
    if ($num_gram == "all" || $is_count_type) {
        $pattern .= "($repeat_pattern)+";
        if ($num_gram == "all") {
            $is_all = true;
        }
        $max_gram_len = -1;
    } else {
        for ($i = 1; $i < $num_gram; $i++) {
            $pattern .= $repeat_pattern;
        }
        $max_gram_len = $num_gram;
    }
    $pattern .= $pattern_end;
    if (is_dir(C\PREP_DIR . "/$wiki_file")) {
        $folder_files = glob(C\PREP_DIR . "/$wiki_file/*.{gz,bz,bz2}",
            GLOB_BRACE);
        if ($folder_files === false) {
            $folder_files = [];
        }
    } else {
        $folder_files = [C\PREP_DIR . "/$wiki_file"];
    }
    $ngrams_file_path = C\LOCALE_DIR . "/$locale/resources/" .
        "{$num_gram}" . self::TEXT_SUFFIX;
    $ngrams = [];
    /* Extracts the gram (if any) from one complete input line. Case is
       folded here, on whole lines, so a multi-byte character can never
       be cut in two by a block boundary before it is converted. */
    $collect = function ($raw_line) use (&$ngrams, &$max_gram_len,
        $pattern, $is_count_type, $is_all, $num_gram, $locale,
        $replace_array) {
        $line = mb_strtolower($raw_line);
        if (preg_match($pattern, $line, $matches) !== 1) {
            return;
        }
        if ($is_count_type) {
            // page count lines look like: project title count ...
            $line_parts = explode(" ", $matches[0]);
            if (!isset($line_parts[1]) || !isset($line_parts[2])) {
                return;
            }
            $ngram = str_replace("_", " ", $line_parts[1]);
            $char_grams = PhraseParser::getCharGramsTerm([$ngram],
                $locale);
            $ngram = implode(" ", $char_grams);
            $ngram_num_words = mb_substr_count($ngram, " ") + 1;
            if (($is_all && $ngram_num_words > 1) ||
                (!$is_all && $ngram_num_words == $num_gram)) {
                $ngrams[$ngram] = $line_parts[2];
            }
        } else {
            // strip the literal markers surrounding the title
            $ngram = str_replace($replace_array, "", $matches[0]);
            $ngram = str_replace("_", " ", $ngram);
            $ngrams[] = $ngram;
            $ngram_num_words = mb_substr_count($ngram, " ") + 1;
        }
        if ($is_all) {
            $max_gram_len = max($max_gram_len, $ngram_num_words);
        }
    };
    foreach ($folder_files as $wiki_file_path) {
        $extension = strtolower(pathinfo($wiki_file_path,
            PATHINFO_EXTENSION));
        if ($extension == "bz2" || $extension == "bz") {
            $fr = bzopen($wiki_file_path, 'r') or
                die ("Can't open compressed file");
            $read = "bzread";
            $close = "bzclose";
        } else if ($extension == "gz") {
            $fr = gzopen($wiki_file_path, 'r') or
                die ("Can't open compressed file");
            $read = "gzread";
            $close = "gzclose";
        } else {
            $fr = fopen($wiki_file_path, 'r') or die("Can't open file");
            $read = "fread";
            $close = "fclose";
        }
        $input_buffer = "";
        $time = time();
        echo "Reading wiki file ...$wiki_file_path...\n";
        $bytes = 0;
        $bytes_since_last_output = 0;
        while (!feof($fr)) {
            $input_text = $read($fr, self::BLOCK_SIZE);
            if ($input_text === false) {
                break;
            }
            $len = strlen($input_text);
            if ($len == 0) {
                break;
            }
            $bytes += $len;
            $bytes_since_last_output += $len;
            if ($bytes_since_last_output > $output_message_threshold) {
                echo "Have now read ".$bytes." many bytes." .
                    " Peak memory so far: ".memory_get_peak_usage().
                    ".\n Number of word grams so far: ".count($ngrams).
                    ". Elapsed time so far: ".(time() - $time)."s\n";
                $bytes_since_last_output = 0;
            }
            $input_buffer .= $input_text;
            $lines = explode("\n", $input_buffer);
            // last piece may be an incomplete line; keep it for later
            $input_buffer = array_pop($lines);
            foreach ($lines as $line) {
                $collect($line);
            }
        }
        if ($input_buffer !== "") {
            // final line of the file had no terminating newline
            $collect($input_buffer);
        }
        $close($fr);
    }
    if ($is_count_type) {
        // most frequently viewed first, then drop the counts
        arsort($ngrams);
        $ngrams = array_keys($ngrams);
    }
    // array_values so positions are contiguous for the index loop below
    $ngrams = array_values(array_unique($ngrams));
    $num_ngrams_found = count($ngrams);
    if ($max_terms > 0 && $num_ngrams_found > $max_terms) {
        $ngrams = array_slice($ngrams, 0, $max_terms);
    }
    $num_ngrams_found = count($ngrams);
    // in is_all case add prefix*'s for (n >= 3)-grams
    if ($is_all) {
        for ($i = 0; $i < $num_ngrams_found; $i++) {
            $ngram_in_word = mb_substr_count($ngrams[$i], " ") + 1;
            if ($ngram_in_word >= 3) {
                $ngram_parts = explode(" ", $ngrams[$i]);
                $ngram = $ngram_parts[0];
                for ($j = 1; $j < $ngram_in_word - 1; $j++) {
                    $ngram .= " " . $ngram_parts[$j];
                    $ngrams[] = $ngram . "*";
                }
            }
        }
        $ngrams = array_unique($ngrams);
        $num_ngrams_found = count($ngrams);
    }
    sort($ngrams);
    $ngrams_string = implode("\n", $ngrams);
    file_put_contents($ngrams_file_path, $ngrams_string);
    return [$num_ngrams_found, $max_gram_len];
}
|
||||
}
|
63
src/library/Notifier.php
Normal file
63
src/library/Notifier.php
Normal file
|
@ -0,0 +1,63 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Contract for objects that want to be told by a priority queue when the
 * position of one of the queue's data items changes.
 *
 * Viewing the queue as an array, each data item sits at some index. When
 * that index changes the queue calls notify() so that the Notifier can
 * update its own record of where the item is. In the search engine,
 * WebQueueBundle implements Notifier: a web queue bundle stores urls
 * with weights and lets one retrieve the url of highest weight. It does
 * this by keeping, in a PriorityQueue, keys that are fixed length hashes
 * of urls with the weights as values, while a web archive holds each url
 * together with its index in the priority queue. Whenever an index in
 * the queue changes, WebQueueBundle's notify method is called to adjust
 * the index stored in the web archive.
 *
 * @author Chris Pollett
 * @see WebQueueBundle
 */
interface Notifier
{
    /**
     * Called when the index of a data item in a queue has changed so the
     * Notifier object can bring its own bookkeeping up to date.
     *
     * @param int $index the index of a row in a heap-based priority queue
     * @param mixed $data the data that is stored at that index
     */
    public function notify($index, $data);
}
|
553
src/library/PageRuleParser.php
Normal file
553
src/library/PageRuleParser.php
Normal file
|
@ -0,0 +1,553 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Has methods to parse user-defined page rules to apply documents
|
||||
* to be indexed.
|
||||
*
|
||||
* There are two types of statements that a user can define:
|
||||
* command statements and assignment statements
|
||||
*
|
||||
* A command statement takes a key field argument for the page associative array
|
||||
* and does a function call to manipulate that page.
|
||||
* These have the syntax:
|
||||
* addMetaWord(field) ;add the field and field value to the META_WORD
|
||||
* ;array for the page
|
||||
* addKeywordLink(field) ;split the field on a comma, view this as a search
|
||||
* ;keywords => link text association, and add this to
|
||||
* ;the KEYWORD_LINKS array.
|
||||
* setStack(field) ;set which field value should be used as a stack
|
||||
* pushStack(field) ;add the field value for field to the top of stack
|
||||
* popStack(field) ;pop the top of the stack into the field value for
|
||||
* ;field
|
||||
* setOutputFolder(dir) ;if auxiliary output, rather than just to the
|
||||
* ; a yioop index, is being done, then set the folder
|
||||
* ; for this output to be dir
|
||||
* setOutputFormat(format) ;format of auxiliary output either CSV or SQL
|
||||
* ;SQL mean that writeOutput will write an insert
|
||||
* ;statement
|
||||
* setOutputTable(table) ;if output is SQL then what table to use for the
|
||||
* ;insert statements
|
||||
* toArray(field) ;splits field value for field on a comma and
|
||||
* ;assign field value to be the resulting array
|
||||
* toString(field) ;if field value is an array then implode that
|
||||
* ;array using comma and store the result in field
|
||||
* ;value
|
||||
* unset(field) ;unset that field value
|
||||
* writeOutput(field) ;use the contents of field value viewed as an array
|
||||
* ;to fill in the columns of a SQL insert statement
|
||||
* ;or CSV row
|
||||
*
|
||||
* Assignments can either be straight assignments with '=' or concatenation
|
||||
* assignments with '.='. There are the following kinds of values that one
|
||||
* can assign:
|
||||
*
|
||||
* field = some_other_field ; sets $page['field'] = $page['some_other_field']
|
||||
* field = "some_string" ; sets $page['field'] to "some string"
|
||||
* field = /some_regex/replacement_where_dollar_vars_allowed/
|
||||
* ; computes the results of replacing matches to some_regex in
|
||||
* ; $page['field'] with replacement_where_dollar_vars_allowed
|
||||
* field = /some_regex/g ;sets $page['field'] to the array of all matches
|
||||
* ; of some regex in $page['field']
|
||||
*
|
||||
* For each of the above assignments we could have used ".=" instead of "="
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class PageRuleParser implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Used to store parse trees that this parser executes
|
||||
* @var array
|
||||
*/
|
||||
public $rule_trees;
|
||||
/**
|
||||
* If outputting to auxiliary file is being done, the current folder to
|
||||
* use for such output
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
public $output_folder="";
|
||||
/**
|
||||
* If outputting to auxiliary file is being done, the current file format
|
||||
* to output with (either SQL or CSV)
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
public $output_format="";
|
||||
|
||||
/**
|
||||
* If outputting to auxiliary file is being done, and the current file
|
||||
* format is SQL then what table to output insert statements for
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
public $output_table="";
|
||||
/**
|
||||
* Name of field which will be used as a stack for push and popping other
|
||||
* fields values
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
public $stack;
|
||||
/**
 * Builds a PageRuleParser whose rule trees come from parsing the
 * supplied page rules
 *
 * @param string $page_rules a sequence of lines with page rules
 *      as described in the class comments
 */
public function __construct($page_rules = "")
{
    $parsed_trees = $this->parseRules($page_rules);
    $this->rule_trees = $parsed_trees;
}
|
||||
/**
 * Parses a string of pages rules into parse trees that can be executed
 * later
 *
 * Each recognized rule becomes one associative array. A command rule
 * yields ["func_call" => name, "arg" => argument]. An assignment rule
 * yields ["var" => field, "assign_op" => "=" or ".=",
 * "value_type" => "string", "literal", "match_all" or "substitution",
 * "value" => ...] where value is the text between the quotes for a
 * string, the field name for a literal, and the pair
 * [/regex/, remainder after the regex] for the two regex forms.
 * Inside a quoted string a backslash escapes the character that follows
 * it (so \" does not end the string); the escapes are stored as written.
 * Text that does not form a rule is ignored.
 *
 * @param string $page_rules a sequence of lines with page rules
 *      as described in the class comments
 * @return array of parse trees which can be executed in sequence
 */
public function parseRules($page_rules)
{
    /* In this single-quoted literal \\\\ gives the regex \\ (one literal
       backslash), so \\\\. is "backslash followed by any character". */
    $quote_string = '"([^"\\\\]*(\\\\.[^"\\\\]*)*)"';
    $blank = '[ \t]';
    $comment = $blank.'*;[^\n]*';
    $literal = '\w+';
    $assignment = '\.?=';
    $end = '(?:\n|\Z)';
    $sub_or_match_all = '(/[^/\n]+/)(g|([^/\n]*)/)';
    $command = '(\w+)'."$blank*".'\('."$blank*".'([\w\/]+)'.
        "$blank*".'\)';
    /* Capture groups: 1 command name, 2 command argument, 3 comment,
       4 assigned field, 5 assignment operator, 6 whole value,
       8 string contents, 10 literal, 12 /regex/, 13 text after regex */
    $rule =
        "@(?:$command$blank*($comment)?$end".
        "|$blank*($literal)$blank*($assignment)$blank*".
        "((".$quote_string.")|($literal)|($sub_or_match_all))".
        "$blank*($comment)?$end)@";
    $matches = [];
    $num_rules = preg_match_all($rule, $page_rules, $matches);
    $rule_trees = [];
    if (!$num_rules) {
        return $rule_trees;
    }
    for ($i = 0; $i < $num_rules; $i++) {
        $tree = [];
        if ($matches[1][$i] != "" || $matches[3][$i] != "") {
            $tree["func_call"] = $matches[1][$i];
            $tree["arg"] = $matches[2][$i];
        } else {
            $tree["var"] = $matches[4][$i];
            $tree["assign_op"] = $matches[5][$i];
            // first character of the value tells which kind it is
            $value_type_indicator = $matches[6][$i][0];
            if ($value_type_indicator == '"') {
                $tree["value_type"] = "string";
                $tree["value"] = $matches[8][$i];
            } else if ($value_type_indicator == '/') {
                if (substr($matches[6][$i], -1) == "g") {
                    $tree["value_type"] = "match_all";
                } else {
                    $tree["value_type"] = "substitution";
                }
                $tree["value"] = [$matches[12][$i], $matches[13][$i]];
            } else {
                $tree["value_type"] = "literal";
                $tree["value"] = $matches[10][$i];
            }
        }
        $rule_trees[] = $tree;
    }
    return $rule_trees;
}
|
||||
/**
 * Runs a list of rule trees, in order, against the $page_data
 * associative array. If no trees are passed, the trees stored in this
 * parser are used.
 *
 * @param array& $page_data an associative array of containing summary
 *      info of a web page/record (will be changed by this operation)
 * @param array $rule_trees an array of annotated syntax trees to
 *      for rules used to update $page_data
 */
public function executeRuleTrees(&$page_data, $rule_trees = null)
{
    $trees_to_run = ($rule_trees == null) ? $this->rule_trees :
        $rule_trees;
    foreach ($trees_to_run as $rule) {
        // trees with a func_call entry are commands, others assignments
        if (!isset($rule['func_call'])) {
            $this->executeAssignmentRule($rule, $page_data);
            continue;
        }
        $this->executeFunctionRule($rule, $page_data);
    }
}
|
||||
/**
 * Executes a single function-call rule on $page_data.
 *
 * The rule's function name is looked up in a fixed table mapping
 * rule-language names to methods of this class; names that are not in the
 * table are ignored.
 *
 * @param array $tree annotated syntax tree of a function call rule
 *      (uses its 'func_call' and 'arg' entries)
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record (will be changed by this operation)
 */
public function executeFunctionRule($tree, &$page_data)
{
    // rule-language name => method of this class implementing it
    $dispatch_table = [
        "addMetaWord" => "addMetaWord",
        "addKeywordLink" => "addKeywordLink",
        "setOutputFolder" => "setOutputFolder",
        "setOutputFormat" => "setOutputFormat",
        "setOutputTable" => "setOutputTable",
        "setStack" => "setStack",
        "pushStack" => "pushStack",
        "popStack" => "popStack",
        "toArray" => "toArray",
        "toString" => "toString",
        "unset" => "unsetVariable",
        "writeOutput" => "writeOutput"
    ];
    $rule_name = $tree['func_call'];
    if (!in_array($rule_name, array_keys($dispatch_table))) {
        return;
    }
    $method = $dispatch_table[$rule_name];
    $this->$method($tree['arg'], $page_data);
}
|
||||
/**
 * Executes a single assignment rule on $page_data.
 *
 * The target field is created as an empty string if it does not exist
 * yet. The right-hand side is then computed according to the tree's
 * 'value_type' and either replaces the field (assign_op "=") or is
 * appended to it with the string concatenation operator (any other
 * assign_op).
 *
 * @param array $tree annotated syntax tree of an assignment rule
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record (will be changed by this operation)
 */
public function executeAssignmentRule($tree, &$page_data)
{
    $field = $this->getVarField($tree["var"]);
    if (!isset($page_data[$field])) {
        $page_data[$field] = "";
    }
    $value = "";
    $value_type = $tree['value_type'];
    if ($value_type == "literal") {
        // a literal names another field; copy that field's value if set
        $source_field = $this->getVarField($tree["value"]);
        if (isset($page_data[$source_field])) {
            $value = $page_data[$source_field];
        }
    } else if ($value_type == "string") {
        $value = $tree["value"];
    } else if ($value_type == "substitution") {
        $value = preg_replace($tree["value"][0], $tree["value"][1],
            $page_data[$field]);
    } else if ($value_type == "match_all") {
        /* NOTE(review): the argument order below does not line up with
           preg_match_all(pattern, subject, &matches, flags):
           $page_data[$field] sits in the by-reference matches slot and
           $value in the flags slot. Left exactly as it was; the intended
           semantics need to be confirmed before changing it.
         */
        preg_match_all($tree["value"][0], $tree["value"][1],
            $page_data[$field], $value);
    }
    if ($tree["assign_op"] == "=") {
        $page_data[$field] = $value;
    } else {
        $page_data[$field] .= $value;
    }
}
|
||||
/**
 * Either returns $var_name or the value of the CrawlConstants constant
 * with name $var_name.
 *
 * @param string $var_name field to look up
 * @return string value of CrawlConstants::$var_name if such a constant is
 *      defined, otherwise $var_name itself
 */
public function getVarField($var_name)
{
    /* defined() and constant() receive the constant's name as a string,
       and class names inside strings are never resolved against the
       current namespace or use statements. CrawlConstants::class is
       resolved at compile time, so the lookup targets the same class
       that the CrawlConstants::... expressions elsewhere in this file use.
     */
    $constant_name = CrawlConstants::class . "::" . $var_name;
    if (defined($constant_name)) {
        return constant($constant_name);
    }
    return $var_name;
}
|
||||
/**
 * Appends the meta word u:$field_name:$page_data[$field_name] to the
 * CrawlConstants::META_WORDS array of this page, creating that array if
 * needed. Does nothing when the field is not set in $page_data.
 *
 * @param string $field the key in $page_data to use (resolved through
 *      getVarField())
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function addMetaWord($field, &$page_data)
{
    $field_name = $this->getVarField($field);
    if (isset($page_data[$field_name])) {
        $meta_word = "u:" . $field_name . ":" . $page_data[$field_name];
        if (!isset($page_data[CrawlConstants::META_WORDS])) {
            $page_data[CrawlConstants::META_WORDS] = [];
        }
        $page_data[CrawlConstants::META_WORDS][] = $meta_word;
    }
}
|
||||
/**
 * Adds a $keywords => $link_text pair to the KEYWORD_LINKS array for
 * this page based on the value of $field on the page. The pair is
 * obtained by splitting the field's value on commas and taking the first
 * two pieces (no trimming is done); values with fewer than two pieces are
 * ignored. The KEYWORD_LINKS array can be used when a cached version of a
 * page is displayed to show a list of links from the cached page in the
 * header. These links correspond to searches in Yioop. For example, the
 * value:
 * madonna, rock star
 * would add a link to the top of the cache page with text "rock star"
 * which when clicked would perform a Yioop search on madonna.
 *
 * @param string $field the key in $page_data to use
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function addKeywordLink($field, &$page_data)
{
    $field_name = $this->getVarField($field);
    if (!isset($page_data[$field_name])) {
        return;
    }
    $pieces = explode(",", $page_data[$field_name]);
    if (count($pieces) < 2) {
        return;
    }
    $key_words = $pieces[0];
    $link_text = $pieces[1];
    if (!isset($page_data[CrawlConstants::KEYWORD_LINKS])) {
        $page_data[CrawlConstants::KEYWORD_LINKS] = [];
    }
    $page_data[CrawlConstants::KEYWORD_LINKS][$key_words] = $link_text;
}
|
||||
/**
 * Selects the field variable to be used as the current stack and makes
 * sure it holds an array: a string value is wrapped into a one element
 * array, an existing array is kept, and anything else (including a
 * missing field) becomes an empty array.
 *
 * @param string $field what field variable to use for the current stack
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function setStack($field, &$page_data)
{
    $this->stack = $this->getVarField($field);
    $stack_name = $this->stack;
    $current = isset($page_data[$stack_name]) ? $page_data[$stack_name] :
        null;
    if (is_string($current)) {
        $page_data[$stack_name] = [$current];
    } else if (!is_array($current)) {
        $page_data[$stack_name] = [];
    }
}
|
||||
/**
 * Pushes an element, or all the items of an array, stored in a field
 * onto the current stack.
 *
 * Nothing happens if no stack has been selected, if the stack field does
 * not hold an array, or if the source field is missing or is neither a
 * string nor an array.
 *
 * @param string $field what field to get the data to push onto the
 *      current stack from
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function pushStack($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    if (!isset($this->stack) || !isset($page_data[$this->stack]) ||
        !is_array($page_data[$this->stack]) ||
        !isset($page_data[$var_field])) {
        return;
    }
    $items = $page_data[$var_field];
    if (is_string($items)) {
        $page_data[$this->stack][] = $items;
    } else if (is_array($items)) {
        /* the merged array has to be stored back into the stack field;
           $this->stack itself only holds the name of that field
         */
        $page_data[$this->stack] = array_merge($page_data[$this->stack],
            $items);
    }
}
|
||||
/**
 * Pops the top element off the current stack and stores it in a field.
 * Nothing happens if the current stack field is not set in $page_data.
 *
 * @param string $field what field to store the popped element in
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function popStack($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    if (isset($page_data[$this->stack])) {
        $page_data[$var_field] = array_pop($page_data[$this->stack]);
    }
}
|
||||
/**
 * Sets the output folder used by writeOutput.
 *
 * The directory name is trimmed and passed through realpath(), so the
 * stored value is whatever realpath() returns for it.
 *
 * @param string $dir output directory in which to write data.txt files
 *      containing the contents of some fields after writeOutput commands
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record (not used by this command)
 */
public function setOutputFolder($dir, &$page_data)
{
    $trimmed_dir = trim($dir);
    $this->output_folder = realpath($trimmed_dir);
}
|
||||
/**
 * Sets the output format used by writeOutput. Values other than csv and
 * sql are ignored and leave the current format unchanged.
 *
 * @param string $format can be either csv or sql
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record (not used by this command)
 */
public function setOutputFormat($format, &$page_data)
{
    if ($format == "csv" || $format == "sql") {
        $this->output_format = $format;
    }
}
|
||||
/**
 * Sets the name of the table that writeOutput puts into the INSERT
 * statements it generates when the output format is sql.
 *
 * @param string $table table to use if output format is sql
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record (not used by this command)
 */
public function setOutputTable($table, &$page_data)
{
    $this->output_table = $table;
}
|
||||
/**
 * If $page_data[$field] is a string, splits it into an array on comma,
 * trims leading and trailing whitespace from each item and stores the
 * result back into $page_data[$field]. Missing fields and non-string
 * values are left untouched.
 *
 * @param string $field the key in $page_data to use
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function toArray($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    // isset first so an absent field does not raise an undefined index
    if (isset($page_data[$var_field]) &&
        is_string($page_data[$var_field])) {
        $page_data[$var_field] = array_map('trim',
            explode(",", $page_data[$var_field]));
    }
}
|
||||
/**
 * If $page_data[$field] is an array, implodes it into a string on comma,
 * and stores the result back into $page_data[$field]. Missing fields and
 * non-array values are left untouched.
 *
 * @param string $field the key in $page_data to use
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function toString($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    // isset first so an absent field does not raise an undefined index
    if (isset($page_data[$var_field]) &&
        is_array($page_data[$var_field])) {
        $page_data[$var_field] = implode(",", $page_data[$var_field]);
    }
}
|
||||
/**
 * Unsets the key $field in $page_data. If getVarField() maps $field to a
 * different key (i.e., it names a crawl constant) the key is not unset --
 * it is just set to the empty string.
 *
 * @param string $field the key in $page_data to use
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function unsetVariable($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    if ($var_field != $field) {
        $page_data[$var_field] = "";
        return;
    }
    unset($page_data[$var_field]);
}
|
||||
/**
 * Appends the value of a field to data.txt in the output folder using
 * the current output format. If the field is not set, is empty, or no
 * output folder has been set, nothing is written.
 *
 * Before writing, if data.txt has grown beyond C\MAX_LOG_FILE_SIZE its
 * contents are compressed into data.txt.N.gz (N being the number of such
 * files already present) and data.txt is removed.
 *
 * For csv the value (wrapped into an array if it is a scalar) is written
 * with fputcsv. For sql, one INSERT statement for the table set by
 * setOutputTable is written; if the value has no element 0 its keys are
 * used as the column list.
 *
 * @param string $field the key in $page_data to use
 * @param array& $page_data an associative array containing summary
 *      info of a web page/record
 */
public function writeOutput($field, &$page_data)
{
    $var_field = $this->getVarField($field);
    if (!isset($page_data[$var_field]) || !$this->output_folder) {
        return;
    }
    $data_file = "{$this->output_folder}/data.txt";
    /* filesize() results are cached by PHP, so the cache is cleared
       before the size check rather than after it; otherwise a stale size
       could be compared against the limit
     */
    clearstatcache();
    if (file_exists($data_file) &&
        filesize($data_file) > C\MAX_LOG_FILE_SIZE) {
        $data_files = glob("$data_file.*.gz") ?: []; //glob may give false
        $num_data_files = count($data_files);
        /* NOTE(review): gzcompress() produces zlib-format data, not a
           gzip file, even though the archive is named *.gz -- confirm
           what the readers of these files expect before changing it
         */
        file_put_contents("$data_file.$num_data_files.gz",
            gzcompress(file_get_contents($data_file)));
        unlink($data_file);
    }
    $out = $page_data[$var_field];
    if (!$out) {
        return;
    }
    if (!is_array($out)) {
        $out = [$out];
    }
    $fh = fopen($data_file, "a");
    if (!$fh) {
        return;
    }
    switch ($this->output_format) {
        case 'csv':
            fputcsv($fh, $out);
            break;
        case 'sql':
            if (!$this->output_table) {
                break;
            }
            /* NOTE(review): values come from crawled pages and are only
               passed through addslashes(); table and column names are
               not escaped at all. Whoever executes the generated file
               should treat it as untrusted input.
             */
            $sql = "INSERT INTO {$this->output_table} ";
            if (isset($out[0])) {
                $sql .= " VALUES(";
            } else {
                // associative value: its keys become the column list
                $sql .= '( ' . implode(", ", array_keys($out)) .
                    ') VALUES(';
            }
            $quoted_values = [];
            foreach ($out as $value) {
                $quoted_values[] = "'" . addslashes($value) . "'";
            }
            $sql .= " " . implode(", ", $quoted_values) . ");\n";
            fwrite($fh, $sql);
            break;
    }
    fclose($fh);
}
|
||||
}
|
155
src/library/PartialZipArchive.php
Normal file
155
src/library/PartialZipArchive.php
Normal file
|
@ -0,0 +1,155 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Used to extract files from an initial segment or a fragment of a
 * ZIP Archive.
 *
 * @author Chris Pollett
 */
class PartialZipArchive
{
    /**
     * Stores path/filename -> [compression type, compressed file]
     * associations for all files in the archive that were extractable from
     * the given zip archive fragment
     * @var array
     */
    public $zip_directory = [];
    /**
     * Stores path/filenames that were discovered in the initial segment of
     * this zip archive
     * @var array
     */
    public $zip_file_names = [];
    /** ZIP code to indicate compression type is no compression used*/
    const NO_COMPRESSION = 0;
    /** ZIP code to indicate compression type is deflate*/
    const DEFLATE = 8;
    /** ZIP code to indicate compression type is enhanced deflate (4gb barrier
     * passable)
     */
    const ENHANCED_DEFLATE = 9;
    /** Byte string to indicate start of a local file header, used to find
     * locations of all the files stored in ZIP fragment we have
     */
    const LOCAL_FILE_HEADER = "\x50\x4B\x03\x04";
    /** Number of bytes in the fixed-size part of a local file header that
     * follows the LOCAL_FILE_HEADER signature (the file name starts right
     * after it)
     */
    const HEADER_LENGTH = 26;
    /**
     * Sets up a PartialZipArchive so that files can be extracted from it.
     * To do this it populates the two field variables @see $zip_directory
     * and @see $zip_file_names. Offsets used in the code for extracting
     * various fields out of a zip archive local file header were gotten
     * from https://en.wikipedia.org/wiki/ZIP_%28file_format%29
     * Note the code for the constructor just splits the whole string into
     * parts on the string @see LOCAL_FILE_HEADER. It doesn't bother to try
     * to use the zip archive's directory (which might not be in the portion
     * of this zip archive given). It is possible for a file contained
     * in the archive to actually have within it the string
     * LOCAL_FILE_HEADER, in which case that file would be screwed up by our
     * approach.
     *
     * Parts that are too short to hold a complete header, entries whose
     * data is cut off before the compressed size recorded in the header,
     * and entries with an empty name or a recorded compressed size of 0
     * are skipped.
     *
     * @param string $zip_string a substring of a zip archive file
     */
    public function __construct($zip_string)
    {
        $sub_files = explode(self::LOCAL_FILE_HEADER, $zip_string);
        foreach ($sub_files as $sub_file) {
            if (strlen($sub_file) < self::HEADER_LENGTH) {
                // not enough bytes to read the header fields from
                continue;
            }
            // all header fields are little-endian
            list(, $compression) = unpack("v", substr($sub_file, 4, 2));
            list(, $file_size) = unpack("V", substr($sub_file, 14, 4));
            list(, $file_name_len, $extra_field_len) =
                unpack("v2", substr($sub_file, 22, 4));
            $file_start = self::HEADER_LENGTH + $file_name_len +
                $extra_field_len;
            $file_name = (string)substr($sub_file, self::HEADER_LENGTH,
                $file_name_len);
            $file_string = (string)substr($sub_file, $file_start,
                $file_size);
            if (strlen($file_string) < $file_size) {
                // data of this entry is truncated in the fragment
                continue;
            }
            /* compare against "" rather than testing truthiness so that
               a file named "0" or containing just "0" is not dropped
             */
            if ($file_name === "" || $file_string === "") {
                continue;
            }
            if (!isset($this->zip_directory[$file_name])) {
                $this->zip_file_names[] = $file_name;
            }
            $this->zip_directory[$file_name] = [$compression, $file_string];
        }
    }
    /**
     * Returns the total number of files that were detected in the zip archive
     * fragment.
     *
     * @return int number of files found in archive
     */
    public function numFiles()
    {
        return count($this->zip_file_names);
    }
    /**
     * Returns the file name for the ith file that was extractable from
     * the archive string used in the constructor.
     *
     * @param int $index the number of the file wanted
     * @return mixed its corresponding file name, or false if there is no
     *      file with that index
     */
    public function getNameIndex($index)
    {
        if (isset($this->zip_file_names[$index])) {
            return $this->zip_file_names[$index];
        }
        return false;
    }
    /**
     * Returns from the PartialZipArchive the uncompressed contents of
     * the provided path/filename if found, and false otherwise.
     *
     * @param string $file_name contains complete path and file_name of a file
     * @return mixed uncompressed file contents if found and extractable,
     *      false otherwise
     */
    public function getFromName($file_name)
    {
        if (!isset($this->zip_directory[$file_name])) {
            return false;
        }
        list($compression, $file_string) = $this->zip_directory[$file_name];
        switch ($compression) {
            case self::NO_COMPRESSION:
                return $file_string;
            case self::DEFLATE:
            case self::ENHANCED_DEFLATE:
                return gzinflate($file_string);
        }
        return false;
    }
}
|
121
src/library/PersistentStructure.php
Normal file
121
src/library/PersistentStructure.php
Normal file
|
@ -0,0 +1,121 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * A PersistentStructure is a data structure which every so many operations
 * will be saved to secondary storage (such as disk).
 * An operation occurs whenever the PersistentStructure's checkSave method is
 * called. A PersistentStructure also supports the ability to be loaded
 * (read in from) secondary storage.
 *
 * @author Chris Pollett
 */
class PersistentStructure
{
    /** If not specified in the constructor, this will be the number of
     * operations between saves
     * @var int
     */
    const DEFAULT_SAVE_FREQUENCY = 50000;
    /** Name of the file in which to store the PersistentStructure
     * @var string
     */
    public $filename;
    /** Number of operations since the last save
     * @var int
     */
    public $unsaved_operations;
    /** Number of operations between saves. If <= 0, checkSave never saves
     * @var int
     */
    public $save_frequency;

    /**
     * Sets up the file name and save frequency for the PersistentStructure,
     * initializes the operation count
     *
     * @param string $fname the name of the file to store the
     *      PersistentStructure in
     * @param int $save_frequency the number of operations before a save. If
     *      <= 0, checkSave never saves
     */
    public function __construct($fname,
        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
    {
        $this->filename = $fname;
        $this->save_frequency = $save_frequency;
        $this->unsaved_operations = 0;
    }
    /**
     * Load a PersistentStructure from a file
     *
     * The class name recorded in the serialized data is replaced by the
     * name of the class load() was called on, to handle the fact that the
     * name space of the stored object may not be the modern namespace.
     *
     * NOTE(review): the file contents are passed to unserialize(), so this
     * should only ever be pointed at files this application wrote itself.
     *
     * @param string $fname the name of the file to load the
     *      PersistentStructure from
     * @return mixed the PersistentStructure loaded, or false if the file
     *      could not be read or does not hold a serialized object
     */
    public static function load($fname)
    {
        $obj_string = file_get_contents($fname);
        // a serialized object starts with O:<name length>:"<name>"
        if ($obj_string === false ||
            !preg_match('/^O:(\d+):"/', $obj_string, $matches)) {
            return false;
        }
        $name_length = intval($matches[1]);
        $name_space_info_length = strlen($matches[0]) + $name_length
            + 1; // 1 for the closing quote
        $actual_name = get_called_class();
        $obj_string = 'O:' . strlen($actual_name) . ':"' . $actual_name .
            '"' . substr($obj_string, $name_space_info_length);
        return unserialize($obj_string);
    }
    /**
     * Save the PersistentStructure to its filename
     * This method is generic but super memory inefficient, so reimplementing
     * it in subclasses is needed
     */
    public function save()
    {
        file_put_contents($this->filename, serialize($this));
    }
    /**
     * Add one to the unsaved_operations count. If save_frequency is
     * positive and the count has reached it, save the PersistentStructure
     * to secondary storage and reset the count
     */
    public function checkSave()
    {
        $this->unsaved_operations++;
        if ($this->save_frequency > 0 &&
            $this->unsaved_operations >= $this->save_frequency) {
            $this->save();
            $this->unsaved_operations = 0;
        }
    }
}
|
1368
src/library/PhraseParser.php
Normal file
1368
src/library/PhraseParser.php
Normal file
File diff suppressed because it is too large
Load diff
382
src/library/PriorityQueue.php
Normal file
382
src/library/PriorityQueue.php
Normal file
|
@ -0,0 +1,382 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/**
|
||||
* Loaded for crawlLog function
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
/**
|
||||
*
|
||||
* Code used to manage a memory efficient priority queue.
|
||||
* Weights for the queue must be floats. The queue itself is
|
||||
* implemented using heaps
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class PriorityQueue extends StringArray implements CrawlConstants
|
||||
{
|
||||
/**
|
||||
* Number of values that can be stored in the priority queue
|
||||
* @var int
|
||||
*/
|
||||
public $num_values;
|
||||
/**
|
||||
* Number of bytes needed to store a value associated with a weight
|
||||
* @var int
|
||||
*/
|
||||
public $value_size;
|
||||
/**
|
||||
* Number of bytes needed to store a weight in the queue
|
||||
* @var int
|
||||
*/
|
||||
public $weight_size = 4; //size of a float
|
||||
/**
|
||||
* Number of items that are currently stored in the queue
|
||||
* @var int
|
||||
*/
|
||||
public $count;
|
||||
/**
|
||||
* Whether polling the queue returns the least or most weighted value
|
||||
* @var string
|
||||
*/
|
||||
public $min_or_max;
|
||||
/**
|
||||
* An object that implements the Notifier interface (for instance,
|
||||
* WebQueueArchive)
|
||||
* @var object
|
||||
*/
|
||||
public $notifier; // who to call if move an item in queue
|
||||
/**
 * Makes a priority queue (implemented as an array heap) with the given
 * operating parameters. The queue starts out empty; each row handed to
 * the parent constructor is sized to hold a value plus its weight.
 *
 * @param string $fname filename to store the data associated with the queue
 * @param int $num_values number of values the queue can hold
 * @param int $value_size the size in bytes of a value
 * @param string $min_or_max whether this priority queue returns least or
 *      most weight values when polled
 * @param object $notifier object to call when a value changes in the queue
 * @param int $save_frequency how often the data in the queue should be
 *      saved to disk. (Its default location is RAM)
 */
public function __construct($fname, $num_values, $value_size,
    $min_or_max, $notifier = null,
    $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
{
    $this->count = 0;
    $this->notifier = $notifier;
    $this->min_or_max = $min_or_max;
    $this->value_size = $value_size;
    $this->num_values = $num_values;
    $row_size = $value_size + $this->weight_size;
    parent::__construct($fname, $num_values, $row_size, $save_frequency);
}
|
||||
/**
 * Gets the data stored at the ith location in the priority queue without
 * removing it. An out of range index is reported through crawlLog.
 *
 * @param int $i location to return data from
 * @return mixed array data if the value of $i is between 1 and count, false
 *      otherwise
 */
public function peek($i = 1)
{
    if ($i >= 1 && $i <= $this->count) {
        return $this->getRow($i);
    }
    crawlLog("Peek Index $i not in Range [1, {$this->count}]");
    return false;
}
|
||||
/**
 * Removes and returns the ith element out of the Priority queue.
 * Since this is a priority queue the first element in the queue
 * will either be the min or max (depending on queue type) element
 * stored. If $i is not in range an error message is written to the log.
 * This operation also performs a check to see if the queue should be
 * saved to disk
 *
 * The removed row is replaced by the last row of the heap. When $i is
 * not the root, that replacement can come from a different subtree and
 * so may outrank its new parent as well as being outranked by its new
 * children; it is therefore first percolated up and, only if that did
 * not move it, percolated down.
 *
 * @param int $i element to get out of the queue
 * @return mixed array data if the value of $i is between 1 and count, false
 *      otherwise
 */
public function poll($i = 1)
{
    if ($i < 1 || $i > $this->count) {
        crawlLog("Index $i not in Range [1, {$this->count}]");
        return false;
    }
    $extreme = $this->peek($i);
    $last_entry = $this->getRow($this->count);
    $this->putRow($i, $last_entry);
    $this->count--;
    /* if $i was the last row there is nothing left at $i to move up;
       percolateUp returns the final position of the row it was given
     */
    $loc = ($i <= $this->count) ? $this->percolateUp($i) : $i;
    if ($loc == $i) {
        $this->percolateDown($i);
    }
    $this->checkSave();
    return $extreme;
}
|
||||
/**
 * Inserts a new item into the priority queue. The item is placed after
 * the current last row and then percolated up to its proper position.
 *
 * @param string $data what to insert into the queue
 * @param float $weight how much the new data should be weighted
 * @return mixed index location in queue where item was stored if
 *      successful, otherwise (the queue is full) false.
 */
public function insert($data, $weight)
{
    if ($this->count == $this->num_values) {
        return false;
    }
    $slot = ++$this->count;
    $this->putRow($slot, [$data, $weight]);
    return $this->percolateUp($slot);
}
|
||||
/**
 * Adds $delta to the weight of the $ith element in the priority queue
 * and then adjusts the queue to restore the heap property.
 *
 * A row whose weight grew is percolated down in a min queue and up
 * otherwise; a row whose weight did not grow is percolated down in a max
 * queue and up otherwise.
 *
 * @param int $i element whose weight should be adjusted
 * @param float $delta how much to change the weight by
 * @return mixed false if $i is not a valid index in the queue, nothing
 *      otherwise
 */
public function adjustWeight($i, $delta)
{
    $row = $this->peek($i);
    if ($row === false) {
        crawlLog("Index $i not in queue adjust weight failed");
        return false;
    }
    list($data, $old_weight) = $row;
    $new_weight = $old_weight + $delta;
    $this->putRow($i, [$data, $new_weight]);
    // the queue type for which this change lowers the row's priority
    $sinks_in = ($new_weight > $old_weight) ? self::MIN : self::MAX;
    if ($this->min_or_max == $sinks_in) {
        $this->percolateDown($i);
    } else {
        $this->percolateUp($i);
    }
}
|
||||
/**
 * Pretty prints the contents of the queue viewed as an array: one line
 * per entry giving its index, value and weight.
 */
public function printContents()
{
    $i = 1;
    while ($i <= $this->count) {
        $row = $this->peek($i);
        echo "Entry: $i Value: {$row[0]} Weight: {$row[1]}\n";
        $i++;
    }
}
|
||||
/**
 * Returns the contents of the priority queue as an array of
 * value weight pairs, in the order the rows are stored in the heap.
 *
 * @return array contents of the queue
 */
public function getContents()
{
    $rows = [];
    $i = 1;
    while ($i <= $this->count) {
        $rows[] = $this->peek($i);
        $i++;
    }
    return $rows;
}
|
||||
/**
 * Scales the weights of elements in the queue so that the sum of the new
 * weights is $new_total
 *
 * This function is used periodically to prevent the queue from being
 * gummed up because all of the weights stored in it are too small.
 * If the current total weight is not positive, every element is given the
 * same weight $new_total/count instead.
 *
 * @param int $new_total what the new sum of weights of elements in the
 *      queue will be after normalization
 */
public function normalize($new_total = C\NUM_URLS_QUEUE_RAM)
{
    $count = $this->count;
    $total_weight = $this->totalWeight();

    if ($total_weight <= 0) {
        crawlLog(
            "Total queue weight was zero!! Doing uniform renormalization!");
    }
    $i = 1;
    while ($i <= $count) {
        $row = $this->getRow($i);
        $row[1] = ($total_weight > 0) ?
            ($new_total*$row[1])/$total_weight : $new_total/$count;
        $this->putRow($i, $row);
        $i++;
    }
}
|
||||
/**
 * If the $ith element in the PriorityQueue violates the heap
 * property with its parent node (children should be of lower
 * priority than the parent), this function
 * tries to modify the heap to restore the heap property.
 *
 * The row at $i is held aside while parents that compare below it are
 * shifted one level down; it is written back once a parent that does not
 * compare below it, or the root, is reached.
 *
 * @param int $i node to consider in restoring the heap property
 * @return int final position $ith node ends up at
 */
public function percolateUp($i)
{
    if ($i <= 1) {
        return $i;
    }
    $moving_row = $this->getRow($i);
    $hole = $i;
    while ($hole > 1) {
        $above = floor($hole/2);
        $above_row = $this->getRow($above);
        if (!($this->compare($above_row[1], $moving_row[1]) < 0)) {
            // parent already outranks the moving row: it belongs here
            $this->putRow($hole, $moving_row);
            return $hole;
        }
        $this->putRow($hole, $above_row);
        $hole = $above;
    }
    $this->putRow(1, $moving_row);
    return 1;
}
|
||||
/**
 * If the ith element in the PriorityQueue violates the heap
 * property with some child node (children should be of lower
 * priority than the parent), this function
 * tries to modify the heap to restore the heap property.
 *
 * The row at $i is lifted out; at each level the higher priority child
 * (according to compare()) is moved up into the hole while it outranks the
 * lifted row, and the lifted row is finally written into the hole.
 *
 * @param int $i node to consider in restoring the heap property
 */
public function percolateDown($i)
{
    $start_row = $this->getRow($i);
    $count = $this->count;
    $parent = $i;
    $child = 2 * $parent;
    while ($child <= $count) {
        // start with the left child; switch to the right one if it exists
        // and has higher priority
        $child_row = $this->getRow($child);
        if ($child < $count) {
            $right_child_row = $this->getRow($child + 1);
            if ($this->compare($child_row[1], $right_child_row[1]) < 0) {
                $child++;
                // reuse the row already read rather than fetching it again
                $child_row = $right_child_row;
            }
        }
        if ($this->compare($start_row[1], $child_row[1]) < 0) {
            $this->putRow($parent, $child_row);
        } else {
            $this->putRow($parent, $start_row);
            return;
        }
        $parent = $child;
        $child = 2 * $parent;
    }
    $this->putRow($parent, $start_row);
}
|
||||
/**
 * Computes the signed difference of the two weights $value1 and $value2
 *
 * For a queue whose min_or_max field equals self::MIN the result is
 * $value2 - $value1, otherwise it is $value1 - $value2. The percolate
 * methods treat a negative result as "first argument has lower priority
 * than the second".
 *
 * @param float $value1 a value to take the difference between
 * @param float $value2 the other value
 * @return float the difference
 */
public function compare($value1, $value2)
{
    return ($this->min_or_max == self::MIN) ?
        $value2 - $value1 :
        $value1 - $value2;
}
|
||||
/**
 * Gets the ith element of the PriorityQueue viewed as an array
 *
 * The stored record is split into its first $this->value_size bytes (the
 * value) and the following $this->weight_size bytes, which are decoded
 * with unpack("f") to obtain the weight.
 *
 * @param int $i element to get
 * @return array value stored in queue together with its weight as a two
 *      element array
 */
public function getRow($i)
{
    $value_len = $this->value_size;
    $weight_len = $this->weight_size;
    $record = $this->get($i);
    $value_part = substr($record, 0, $value_len);
    // unpack() numbers its results from 1
    $decoded = unpack("f", substr($record, $value_len, $weight_len));
    return [$value_part, $decoded[1]];
}
|
||||
/**
 * Add data to the $i row of the priority queue viewed as an array
 * Calls the notifier associated with this queue (if there is one) about
 * the change in data's location
 *
 * @param int $i location to add data
 * @param array $row data to add (a two element array in the form
 *      key, float value). The float is stored packed with pack("f")
 *      directly after the key.
 */
public function putRow($i, $row)
{
    $this->put($i, $row[0] . pack("f", $row[1]));
    if ($this->notifier == null) {
        return;
    }
    $this->notifier->notify($i, $row);
}
|
||||
/**
 * Computes and returns the weight of all items in priority queue
 *
 * Weights are added in position order 1, 2, ..., $this->count.
 *
 * @return float weight of all items stored in the priority queue
 */
public function totalWeight()
{
    $num_rows = $this->count;
    $sum = 0;
    $position = 0;
    while (++$position <= $num_rows) {
        $entry = $this->getRow($position);
        $sum += $entry[1];
    }
    return $sum;
}
|
||||
|
||||
|
||||
}
|
185
src/library/ScraperManager.php
Normal file
185
src/library/ScraperManager.php
Normal file
|
@ -0,0 +1,185 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Charles Bocage (charles.bocage@sjsu.edu)
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
/**
|
||||
* Class used by html processors to detect if a page matches a particular
|
||||
* signature such as that of a content management system, and
|
||||
* also to provide scraping mechanisms for the content of such a page
|
||||
*
|
||||
* @author Charles Bocage (charles.bocage@sjsu.edu)
|
||||
*/
|
||||
class ScraperManager
{
    /**
     * Method used to check a page against a supplied list of scrapers
     * for a matching signature. If a match is found that scraper is returned.
     * Scrapers are tried in order and the first match wins; empty entries
     * in $scrapers are skipped.
     *
     * @param string $page the html page to check
     * @param array $scrapers an array of scrapers to check against. Each
     *      non-empty entry is read for its SIGNATURE, ID, SCRAPE_RULES and
     *      NAME fields
     * @return array an associative array of scraper properties if a matching
     *      scraper signature found; otherwise, the empty array. SIGNATURE
     *      and SCRAPE_RULES are returned html entity decoded
     */
    public static function getScraper($page, $scrapers)
    {
        foreach ($scrapers as $scraper) {
            if (empty($scraper)) {
                continue;
            }
            $signature = html_entity_decode(
                $scraper['SIGNATURE'], ENT_QUOTES);
            if (self::checkSignature($page, $signature)) {
                return [
                    'SIGNATURE' => $signature,
                    'ID' => $scraper['ID'],
                    'SCRAPE_RULES' => html_entity_decode(
                        $scraper['SCRAPE_RULES'], ENT_QUOTES),
                    'NAME' => $scraper['NAME'],
                ];
            }
        }
        return [];
    }
    /**
     * Applies scrape rules to a given page. A scrape rule consists of
     * a sequence of xpaths delimited by ###. The first path is used
     * extract content from the page, the remaining xpaths are used
     * to delete content from the result.
     *
     * @param string $page the html page to operate on
     * @param string $scrape_rules_string a string of xpaths with ###
     *      used as a delimiter
     * @return string the result of extracting first xpath content and
     *      deleting from it according to the remaining xpath rules. If
     *      there are no rules or the first xpath extracts nothing, $page
     *      is returned unchanged
     */
    public static function applyScraperRules($page, $scrape_rules_string)
    {
        // Plain string split: the delimiter is a literal, so no regex is
        // needed (a /u regex split fails outright on invalid UTF-8 input).
        $scrape_rules = [];
        foreach (explode('###', (string)$scrape_rules_string) as $rule) {
            if ($rule !== '') {
                $scrape_rules[] = $rule;
            }
        }
        if (empty($scrape_rules)) {
            return $page;
        }
        $extract_rule = array_shift($scrape_rules);
        $temp_page = self::getContentByXquery($page, $extract_rule);
        if (empty($temp_page)) {
            return $page;
        }
        foreach ($scrape_rules as $tag_to_remove) {
            $new_temp_page =
                self::removeContentByXquery($temp_page, $tag_to_remove);
            // a removal that leaves nothing behind is ignored
            if (!empty($new_temp_page)) {
                $temp_page = $new_temp_page;
            }
        }
        return $temp_page;
    }
    /**
     * If $signature begins with '/', checks to see if applying
     * the xpath in $signature to $page results
     * in a non-empty dom node list. Otherwise, does a match of the
     * regex (without matching start and end delimiters (say, /)
     * against $page and returns whether found
     *
     * @param string $page a web document to check
     * @param string $signature an xpath (if it starts with /) or an
     *      mb_ereg style regex to check against
     * @return boolean true if the given xpath returns a non empty dom node
     *      list, or the regex matches; false otherwise (in particular for
     *      an empty signature)
     */
    public static function checkSignature($page, $signature)
    {
        $signature = (string)$signature;
        if ($signature === '') {
            /* nothing to match with: avoids reading offset 0 of an empty
               string and calling mb_ereg with an empty pattern
             */
            return false;
        }
        if ($signature[0] == '/') {
            $results = false;
            restore_error_handler();
            $xpath = self::loadXpath($page);
            if ($xpath) {
                $results = $xpath->query($signature);
            }
            set_error_handler(C\NS_LIB . "yioop_error_handler");
            return !empty($results) && $results->length > 0;
        }
        return (mb_ereg($signature, $page) !== false);
    }
    /**
     * Get the contents of a document via an xpath
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return string the html of the first node found as a string,
     *      otherwise an empty string
     */
    public static function getContentByXquery($page, $query)
    {
        $result = "";
        restore_error_handler();
        $xpath = self::loadXpath($page);
        if ($xpath) {
            $xpath_result = $xpath->query($query);
            if (!empty($xpath_result) && $xpath_result->length > 0) {
                $result = $xpath->document->saveHTML(
                    $xpath_result->item(0));
            }
        }
        set_error_handler(C\NS_LIB . "yioop_error_handler");
        return $result;
    }
    /**
     * Removes from the contents of a document the results of
     * an xpath query
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return string the content less the xpath results as an HTML document.
     *      If the page cannot be parsed or nothing matches, $page is
     *      returned unchanged
     */
    public static function removeContentByXquery($page, $query)
    {
        $result = $page;
        restore_error_handler();
        $xpath = self::loadXpath($page);
        if ($xpath) {
            $xpath_result = $xpath->query($query);
            // query() gives false for a malformed expression
            if (!empty($xpath_result) && $xpath_result->length > 0) {
                $len = $xpath_result->length;
                for ($i = 0; $i < $len; $i++) {
                    $node = $xpath_result->item($i);
                    $parent = $node->parentNode;
                    if ($parent) {
                        $parent->removeChild($node);
                    }
                }
                $result = $xpath->document->saveHTML();
            }
        }
        set_error_handler(C\NS_LIB . "yioop_error_handler");
        return $result;
    }
    /**
     * Parses $page as HTML and returns an xpath evaluator for it. Parse
     * warnings are suppressed. An empty page is rejected up front rather
     * than handed to DOMDocument::loadHTML.
     *
     * @param string $page html to parse
     * @return \DOMXPath|null evaluator whose document property is the
     *      parsed document, or null if $page was empty or could not be
     *      loaded
     */
    private static function loadXpath($page)
    {
        if ((string)$page === '') {
            return null;
        }
        $dom = new \DOMDocument();
        if (!@$dom->loadHTML($page)) {
            return null;
        }
        return new \DOMXPath($dom);
    }
}
|
143
src/library/StringArray.php
Normal file
143
src/library/StringArray.php
Normal file
|
@ -0,0 +1,143 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
|
||||
* Load charCopy
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
/**
|
||||
* Memory efficient implementation of persistent arrays
|
||||
*
|
||||
* The standard array objects in php and even spl have a large amount of
|
||||
* overhead. The point of this class is to have the size as close to the
|
||||
* optimal as possible
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class StringArray extends PersistentStructure
{
    /**
     * Number of items to be stored in the StringArray
     * @var int
     */
    public $num_values;
    /**
     * Size of each item in bytes to be stored
     * @var int
     */
    public $data_size;
    /**
     * Number of bytes of storage needed by the string array
     * @var int
     */
    public $string_array_size;
    /**
     * Character string used to store the packed data of the StringArray
     * @var string
     */
    public $string_array;
    /**
     * Initializes the fields of the StringArray and its parent class
     * PersistentStructure. Creates a null filled string array of size
     * $this->string_array_size to store data in.
     *
     * @param string $fname the name of the file to store data persistently in
     * @param int $num_values the number of items the StringArray will store
     * @param int $data_size the size in bytes of a single item
     * @param int $save_frequency how often the StringArray should be stored to
     *      disk
     */
    public function __construct($fname, $num_values, $data_size,
        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
    {
        $this->num_values = $num_values;
        $this->data_size = $data_size;
        $this->string_array_size = $num_values * ($data_size);
        // "x<n>" packs n NUL bytes
        $this->string_array = pack("x". $this->string_array_size);
        parent::__construct($fname, $save_frequency);
    }
    /**
     * Load a StringArray from a file
     *
     * The file layout, as written by save(), is a 4 byte packed length,
     * that many bytes of raw array data, then the serialized object
     * (without its string_array field).
     *
     * NOTE(review): the tail of the file is passed to unserialize(), so
     * only files written by this code should ever be loaded.
     *
     * @param string $fname the name of the file to load the StringArray from
     * @return object|bool the PersistentStructure loaded, or false if the
     *      file could not be opened or is too short to hold the header.
     *      If the serialized part is missing or corrupt, the result of
     *      unserialize() (false) is returned
     */
    public static function load($fname)
    {
        $fh = fopen($fname, "rb");
        if ($fh === false) {
            return false;
        }
        $header = fread($fh, 4);
        if ($header === false || strlen($header) < 4) {
            fclose($fh);
            return false;
        }
        $array_size = unpackInt($header);
        // fread() rejects a zero length, which a StringArray with no
        // storage legitimately has
        $array = ($array_size > 0) ? fread($fh, $array_size) : "";
        // everything after the array data is the serialized object; reading
        // to end of file avoids relying on a cached filesize()
        $serialized = stream_get_contents($fh);
        fclose($fh);
        $object = unserialize((string)$serialized);
        if (is_object($object)) {
            $object->string_array = & $array;
        }
        return $object;
    }
    /**
     * Save the StringArray to its filename
     *
     * The raw array data is written directly and temporarily detached from
     * the object so that serialize($this) does not include it a second
     * time. If the file cannot be opened for writing nothing is written
     * (fopen itself reports the problem).
     */
    public function save()
    {
        $fh = fopen($this->filename, "wb");
        if ($fh === false) {
            return;
        }
        $tmp = & $this->string_array;
        fwrite($fh, packInt($this->string_array_size));
        fwrite($fh, $this->string_array);
        unset($this->string_array);
        fwrite($fh, serialize($this));
        $this->string_array = & $tmp;
        fclose($fh);
    }
    /**
     * Looks up the ith item in the StringArray
     *
     * @param int $i array index of item to look up
     * @return string the looked-up item of length $this->data_size
     */
    public function get($i)
    {
        $data_size = $this->data_size;
        return substr($this->string_array, $i * $data_size, $data_size);
    }
    /**
     * Puts data into the ith item of the StringArray
     *
     * @param int $i array index of where to store data
     * @param string $data at least $this->data_size many bytes of data to
     *      store
     */
    public function put($i, $data)
    {
        $data_size = $this->data_size;
        $start = $i * $data_size;
        // charCopy is the Utility.php helper; it is given the destination
        // byte offset and the item length
        charCopy($data, $this->string_array, $start, $data_size);
    }
}
|
351
src/library/SuffixTree.php
Normal file
351
src/library/SuffixTree.php
Normal file
|
@ -0,0 +1,351 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Data structure used to maintain a suffix tree for a passage of words.
|
||||
* The suffix tree is constructed using the linear time algorithm of
|
||||
* Ukkonen, E. (1995). "On-line construction of suffix trees".
|
||||
* Algorithmica 14 (3): 249–260.
|
||||
*
|
||||
* @author Chris Pollett
|
||||
*/
|
||||
class SuffixTree
{
    /**
     * Index in $this->tree of the root node of the suffix tree
     * @var int
     */
    public $root;
    /**
     * Index of last node added to the suffix tree in the array used to
     * hold the suffix tree data structures
     * @var int
     */
    public $last_added;
    /**
     * Position in the $this->text up to which we have created a suffix tree
     * so far
     * @var int
     */
    public $pos;
    /**
     * If in a given step in constructing the suffix tree we split the
     * active edge and insert a new node and then have to do this
     * again in the same step, then we need to create a sym_link between
     * the suffix trees represented by these new nodes. This variable
     * keeps track of the index of the first node so we can do this.
     *
     * @var int
     */
    public $need_sym_link;

    /**
     * At a given stage in building the suffix tree how many new suffixes
     * we need to insert
     * @var int
     */
    public $remainder;
    /**
     * Node which represents the start of the active edge
     * This is the edge that contains the last suffix inserted
     * @var int
     */
    public $active_index;
    /**
     * Index into $this->text of starting word of active edge
     * @var int
     */
    public $active_edge_index;
    /**
     * How many words from the start of the active edge label to get the
     * last suffix. If active edge label was: "a black cat a black" and
     * $active_len was 2, then would have "a black" from the first two words.
     * @var int
     */
    public $active_len;
    /**
     * Number of elements in $this->text. i.e., count($this->text)
     * @var int
     */
    public $size;

    /**
     * The sequence of terms, one array entry per term, that a suffix tree is
     * to be made from
     * @var array
     */
    public $text;
    /**
     * Used to hold the suffix tree data structure (represented as a sequence
     * of nodes)
     * @var array
     */
    public $tree;
    /**
     * Upper bound on the length of any path in the tree
     */
    const INFTY = 2000000000;
    /**
     * Initializes a suffix tree based on the supplied array of terms.
     *
     * @param array $text a sequence of terms to build the suffix tree for
     */
    public function __construct($text)
    {
        $this->text = $text;
        $this->size = count($text);
        $this->buildTree();
    }
    /**
     * Builds the complete suffix tree for the text currently stored in
     * $this->text. If you change this text and call this method again,
     * it builds a new tree based on the new text (and refreshes
     * $this->size to match). Uses Ukkonen's algorithm.
     */
    public function buildTree()
    {
        // keep size in step with the text; outputMaximal relies on it
        $this->size = count($this->text);
        $this->tree = [];
        $this->need_sym_link = 0;
        $this->last_added = 0;
        $this->pos = -1;
        $this->remainder = 0;
        $this->active_edge_index = 0;
        $this->active_len = 0;
        $this->root = $this->makeNode(-1, -1);
        $this->active_index = $this->root;
        for ($i = 0; $i < $this->size; $i++) {
            $this->suffixTreeExtend();
        }
    }
    /**
     * Makes a new node for the suffix tree structure. This node
     * is inserted at the end of the tree so far. A node is associative
     * array consisting of the fields "start" whose value
     * is the starting location in $this->text for this node,
     * "end" location in $this->text up to which this node is
     * responsible, "sym_link" is a link to an isomorphic subtree for the
     * purposes of building the suffix tree, and "next" is an array of
     * next children in the tree.
     *
     * @param int $start what to use as the start value mentioned above
     * @param int $end what to use as the end value mentioned above
     * @return int index in $this->tree of the newly created node
     */
    public function makeNode($start, $end = self::INFTY)
    {
        $node = [];
        $node["start"] = $start;
        $node["end"] = $end;
        $node["sym_link"] = 0;
        $node["next"] = [];
        $this->tree[++$this->last_added] = $node;
        return $this->last_added;
    }
    /**
     * The number of elements out of $this->text that this node is currently
     * responsible for
     *
     * @param array& $node the node to compute the length of
     * @return int min of the node's end and $this->pos + 1, minus its start
     */
    public function edgeLength(&$node)
    {
        return min($node["end"], $this->pos + 1) - $node["start"];
    }
    /**
     * If in a given step in constructing the suffix tree we split the
     * active edge and insert a new node and then have to do this
     * again in the same step, then we need to create a sym_link between
     * the suffix trees represented by these new nodes. If in the current
     * step it is necessary to add a sym_link this method sets the
     * $this->need_sym_link node's "sym_link" field to $index which is supposed
     * be the index of the second created node.
     *
     * @param int $index the index of the a created node in a given step.
     *      ($this->need_sym_link will be greater than 0 if it is the second
     *      created node of the step)
     */
    public function addSuffixLink($index)
    {
        if ($this->need_sym_link > 0) {
            $this->tree[$this->need_sym_link]["sym_link"] = $index;
        }
        $this->need_sym_link = $index;
    }
    /**
     * Used to set the active point to the node given by $index
     *
     * @param int $index which node to use for setting
     * @return bool if the current active length is at least $index's edge
     *      length then the active point is moved to $index and true is
     *      returned; otherwise, nothing is updated and false is returned
     */
    public function walkDown($index)
    {
        $edge_length = $this->edgeLength($this->tree[$index]);
        if ($this->active_len >= $edge_length) {
            $this->active_edge_index += $edge_length;
            $this->active_len -= $edge_length;
            $this->active_index = $index;
            return true;
        }
        return false;
    }
    /**
     * Given a suffix tree of the array of terms in $this->text up to
     * $this->pos, adds one to pos and build the suffix tree up to this
     * new value. i.e., the text with one more term added.
     */
    public function suffixTreeExtend()
    {
        $this->pos++;
        $term = $this->text[$this->pos];
        $this->need_sym_link = -1;
        $this->remainder++;
        if (!isset($this->text[$this->active_edge_index])) {
            return;
        }
        while ($this->remainder > 0 &&
            isset($this->text[$this->active_edge_index]) &&
            isset($this->text[$this->pos])) {
            if ($this->active_len == 0) {
                $this->active_edge_index = $this->pos;
            }
            $active_term = $this->text[$this->active_edge_index];
            if (!isset($this->tree[$this->active_index]["next"][$active_term])){
                $leaf = $this->makeNode($this->pos);
                $this->tree[$this->active_index]["next"][$active_term] = $leaf;
                $this->addSuffixLink($this->active_index); //rule 2
            } else {
                $next = $this->tree[$this->active_index]["next"][$active_term];
                if ($this->walkDown($next)) {
                    continue; //observation 2
                }
                $start = $this->tree[$next]["start"];
                /* Terms are compared as strings so that the test agrees
                   with the child lookups above: a loose == would treat
                   numeric looking terms such as "1" and "1.0" as the same
                   term even though they are different array keys.
                 */
                if ((string)$this->text[$start + $this->active_len] ===
                    (string)$term) {
                    //observation 1
                    $this->active_len++;
                    $this->addSuffixLink($this->active_index); //observation 3
                    break;
                }
                $splitNode = $this->makeNode($start, $start+$this->active_len);
                $active_term = $this->text[$this->active_edge_index];
                $this->tree[$this->active_index]["next"][$active_term] =
                    $splitNode;
                $leaf = $this->makeNode($this->pos);
                $this->tree[$splitNode]["next"][$term] = $leaf;
                $this->tree[$next]["start"] += $this->active_len;
                $this->tree[$splitNode]["next"][
                    $this->text[$this->tree[$next]["start"]]] = $next;
                $this->addSuffixLink($splitNode); //rule 2
            }
            $this->remainder--;
            if ($this->active_index == $this->root && $this->active_len > 0) {
                //rule 1
                $this->active_len--;
                $this->active_edge_index = $this->pos - $this->remainder + 1;
            } else {
                $this->active_index =
                    ($this->tree[$this->active_index]["sym_link"] > 0 ) ?
                    $this->tree[$this->active_index]["sym_link"] : $this->root;
                //rule 3
            }
        }
    }
    /**
     * Recursive function used to compute the maximal phrases in a document
     * as well as their conditional maximal subphrases.
     *
     * @param int $index a node in the suffix tree
     * @param string $path from root to current node
     * @param int $len number of terms on the path from root to current node
     *      in suffix tree
     * @param array& $maximal assoc array of phrase => (cond_max => pos of
     *      conditional maximal subphrase, [0] => pos_1st_occurrence of phrase,
     *      [1]=>pos_2nd_occurrence of phrase, etc)
     */
    public function outputMaximal($index, $path, $len, &$maximal)
    {
        $start = $this->tree[$index]["start"];
        $end = $this->tree[$index]["end"];
        // the root has start and end of -1 and contributes no terms
        if ($start >= 0 && $end >= 0) {
            $tmp_terms = array_slice($this->text, $start, $end - $start);
            $tmp = implode(" ", $tmp_terms);
            $num = count($tmp_terms);
            if ($path != "") {
                $begin = $start - $len;
                $out_path = $path;
                // phrases are recorded using at most MAX_QUERY_TERMS terms
                if ($len > C\MAX_QUERY_TERMS) {
                    $out_path = implode(" ", array_slice($this->text, $begin,
                        C\MAX_QUERY_TERMS));
                }
                $maximal[$out_path][] = $begin;
                if (!isset($maximal[$out_path]["cond_max"])) {
                    $maximal[$out_path]["cond_max"] =
                        strpos($out_path, " ") + 1;
                }
                if ($len > 1 && $len < C\MAX_QUERY_TERMS) {
                    $cond_max = strlen($path) + 1;
                }
                $path .= " ".$tmp;
                $len += $num;
                if (isset($cond_max)) {
                    $out_path = $path;
                    if ($len > C\MAX_QUERY_TERMS) {
                        $out_path = implode(" ", array_slice($this->text,
                            $begin, C\MAX_QUERY_TERMS));
                    }
                    $maximal[$out_path]["cond_max"] = $cond_max;
                }
            } else {
                $len = $num;
                $path = $tmp;
            }
        }
        // nodes created with the default end (INFTY) are leaves
        if ($end == self::INFTY) {
            $begin = $this->size - $len;
            $out_path = $path;
            if ($len > C\MAX_QUERY_TERMS) {
                $out_path = implode(" ", array_slice($this->text, $begin,
                    C\MAX_QUERY_TERMS));
            }
            $maximal[$out_path][] = $begin;
            if (!isset($maximal[$out_path]["cond_max"])) {
                $maximal[$out_path]["cond_max"] =
                    strpos($out_path, " ") + 1;
            }
            return;
        }
        foreach ($this->tree[$index]["next"] as $sub_index) {
            $this->outputMaximal($sub_index, $path, $len, $maximal);
        }
    }
}
|
361
src/library/Thesaurus.php
Normal file
361
src/library/Thesaurus.php
Normal file
|
@ -0,0 +1,361 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Shailesh Padave shaileshpadave49@gmail.com
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
|
||||
* Class used to reorder the last 10 links computed by PhraseModel based on
|
||||
* thesaurus semantic information. For English, thesaurus semantic information
|
||||
* can be provided by WordNet, a lexical English database
|
||||
* available at http://wordnet.princeton.edu/
|
||||
* To enable this, you have to define WORDNET_EXEC in your local_config file.
|
||||
* The idea behind thesaurus reordering is that given a query, it
|
||||
* is tagged for parts of speech. Each term is then looked up in thesaurus for
|
||||
* those parts of speech. Representative phrases for those term senses are
|
||||
* extracted from the ranked thesaurus output and a set of rewrites of the
|
||||
* original query are created. By looking up the number
|
||||
* of times these rewrites occur in the searched index the top two phrases
|
||||
* that represent the original query are computed. The BM25 similarity of these
|
||||
* phrases is then scored against each of the 10 output summaries of
|
||||
* PhraseModel and used to reorder the results.
|
||||
* To add thesaurus reordering for a different locale, two methods need to be
|
||||
* written in that locale tokenizer.php file
|
||||
* tagPartsOfSpeechPhrase($phrase) which on an input phrase return a string
|
||||
* where each term_i in the phrase has been replace with term_i~pos
|
||||
* where pos is a two character part of speech NN, VB, AJ, AV, or NA (if
|
||||
* none of the previous apply)
|
||||
* scoredThesaurusMatches($term, $word_type, $whole_query) which takes
|
||||
* a term from an original whole_query which has been tagged to be
|
||||
* one of the types VB (for verb), NN (for noun), AJ (for adjective),
|
||||
* AV (for adverb), or NA (for anything else), it outputs
|
||||
* a sequence of (score => array of thesaurus terms) associations.
|
||||
* The score representing one word sense of term
|
||||
* Given that these methods have been implemented if the use_thesaurus field
|
||||
* of that language tokenizer is set to true, the thesaurus will be used.
|
||||
*/
|
||||
class Thesaurus
{
    /**
     * Extracts similar phrases to the input query using thesaurus results.
     * Part of speech tagging is processed on input and the output is
     * looked up in the thesaurus. Using this a ranked list of alternate
     * query phrases is created.
     * For those phrases, counts in the Yioop index are calculated
     * and the top two phrases are selected.
     *
     * @param string $orig_query input query from user
     * @param string $index_name selected index for search engine
     * @param string $lang locale tag for the query
     * @param integer $threshold once count in posting list for any word
     *     reaches to threshold then return the number
     * @return array of (at most) the two suggested phrases with the
     *     highest document counts, best first
     */
    public static function getSimilarPhrases($orig_query, $index_name,
        $lang, $threshold = 10)
    {
        $num_docs = [];
        $suggested_queries = self::getInitialSuggestions($orig_query, $lang);
        foreach ($suggested_queries as $suggestion) {
            $num_docs[$suggestion] = self::numDocsIndex($suggestion,
                $threshold, $index_name, $lang);
        }
        // highest document count first; the phrases are the array keys
        arsort($num_docs);
        return array_slice(array_keys($num_docs), 0, 2);
    }
    /**
     * Gets array of BM25 scores for given input array of summaries
     * and thesaurus generated queries. Only the first two phrases are
     * used: a single phrase is used as is, two phrases are blended with
     * weights 2/3 and 1/3.
     *
     * @param array $similar_phrases an array of thesaurus generated queries
     * @param array $summaries an array of summaries which is generated
     *     during crawl time.
     * @return array of BM25 score for each document based on the thesaurus
     *     similar phrases; empty if there are no phrases or no summaries
     */
    public static function scorePhrasesSummaries($similar_phrases, $summaries)
    {
        if (empty($similar_phrases)) {
            return [];
        }
        /* phrases beyond the second never contribute to the score, so
           they are not scored; array_values tolerates non 0-based keys
         */
        $phrases = array_slice(array_values($similar_phrases), 0, 2);
        // lower casing the summaries does not depend on the phrase
        $summaries = self::changeCaseOfStringArray($summaries);
        $num_summaries = count($summaries);
        $bm25_result = [];
        foreach ($phrases as $phrase) {
            /* summaries are lower cased and term frequency matching is
               case sensitive, so the terms have to be lower cased too
             */
            $terms = explode(' ', mb_strtolower($phrase));
            $idf = self::calculateIDF($summaries, $terms);
            $tf = self::calculateTFBM25($summaries, $terms);
            $bm25_result[] = self::calculateBM25($idf, $tf, count($terms),
                $num_summaries);
        }
        $score = [];
        $single_phrase = (count($bm25_result) == 1);
        for ($i = 0; $i < $num_summaries; $i++) {
            $score[$i] = ($single_phrase) ? $bm25_result[0][$i] :
                $bm25_result[0][$i] * (2/3) + $bm25_result[1][$i] * (1/3);
        }
        return $score;
    }
    /**
     * Computes suggested related phrases from thesaurus based on part of
     * speech done on each query term.
     *
     * @param string $query query entered by user
     * @param string $lang locale tag for the query
     * @return array $suggestions consisting of phrases suggested to
     *     be similar in meaning to some sense of the query; empty if the
     *     locale's tokenizer does not provide the two thesaurus methods
     */
    public static function getInitialSuggestions($query, $lang)
    {
        $tokenizer = PhraseParser::getTokenizer($lang);
        if (empty($tokenizer) ||
            !method_exists($tokenizer, "tagPartsOfSpeechPhrase") ||
            !method_exists($tokenizer, "scoredThesaurusMatches")) {
            return [];
        }
        $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query);
        $max_len = 25;
        $replacement_phrases = [];
        $pos_terms = preg_split("/\s+/", trim($pos_query), -1,
            PREG_SPLIT_NO_EMPTY);
        $known_word_types = ["NN", "VB", "AJ", "AV"];
        foreach ($pos_terms as $pos_term) {
            $pos = strpos($pos_term, '~');
            if ($pos === false) {
                // term carries no part of speech tag, nothing to look up
                continue;
            }
            $word_type = trim(substr($pos_term, $pos + 1));
            $current_word = substr($pos_term, 0, $pos);
            // any other tag is treated as NA, which is never looked up
            if ($current_word === "" ||
                !in_array($word_type, $known_word_types, true)) {
                continue;
            }
            $similar_phrases = $tokenizer->scoredThesaurusMatches(
                $current_word, $word_type, $query);
            // first entry is taken to be the highest scoring word sense
            $highest_scoring_sense_phrases = ($similar_phrases) ?
                array_shift($similar_phrases) : false;
            if ($highest_scoring_sense_phrases) {
                $replacement_phrases[$current_word] =
                    $highest_scoring_sense_phrases;
            }
        }
        $suggestions = [];
        foreach ($replacement_phrases as $word => $similar_phrases) {
            foreach ($similar_phrases as $phrase) {
                if (mb_strpos(trim($phrase), ' ') !== false) {
                    $phrase = preg_replace('/~[\w]+/', '', $phrase);
                }
                /* literal replacement: the word comes from the user's
                   query so it must not be interpreted as a regex (array
                   keys that look like integers come back as ints)
                 */
                $modified_query = str_replace((string) $word, trim($phrase),
                    $query);
                if (mb_strlen($modified_query) < $max_len &&
                    mb_strpos($modified_query, $query) === false) {
                    $suggestions[] = $modified_query;
                }
            }
        }
        return $suggestions;
    }
    /**
     * Returns the number of documents in an index that a phrase occurs in.
     * If it occurs in more than threshold documents then cut off search.
     *
     * @param string $phrase to look up in index
     * @param int $threshold once count in posting list for any word
     *     reaches to threshold then return the number
     * @param string $index_name selected index for search engine
     * @param string $lang locale tag for the query
     * @return int number of documents phrase occurs in (0 if no index
     *     was given)
     */
    public static function numDocsIndex($phrase, $threshold, $index_name, $lang)
    {
        if ($index_name == null) {
            return 0;
        }
        PhraseParser::canonicalizePunctuatedTerms($phrase, $lang);
        $terms = PhraseParser::stemCharGramSegment($phrase, $lang);
        if (count($terms) > C\MAX_QUERY_TERMS) {
            $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS);
        }
        $whole_phrase = implode(" ", $terms);
        return IndexManager::numDocsTerm($whole_phrase, $index_name,
            $threshold);
    }
    /**
     * Lower cases an array of strings
     *
     * @param array $summaries strings to put into lower case
     * @return array 0-indexed list with strings converted to lower case
     */
    public static function changeCaseOfStringArray($summaries)
    {
        /* each string is converted on its own so that a summary which
           happens to contain a separator sequence is not split in two and
           an empty input stays empty
         */
        return array_values(array_map(function ($summary) {
            return mb_strtolower((string) $summary);
        }, $summaries));
    }
    /**
     * Computes the BM25 of an array of documents given that the idf and
     * tf scores for these documents have already been computed
     *
     * @param array $idf inverse doc frequency for given query array
     * @param array $tf term frequency for given query array
     * @param int $num_terms number of terms that make up input query
     * @param int $num_summaries count for input summaries
     * @return array consisting of BM25 scores for each document
     */
    public static function calculateBM25($idf, $tf, $num_terms, $num_summaries)
    {
        $scores = [];
        for ($j = 0; $j < $num_summaries; $j++) {
            $val = 0;
            for ($i = 0; $i < $num_terms; $i++) {
                $val += $idf[$i] * $tf[$i][$j];
            }
            $scores[$j] = $val;
        }
        return $scores;
    }
    /**
     * Calculates the BM25 normalized term frequency of a set of terms in
     * a collection of text summaries
     *
     * @param array $summaries list of summary strings to compute BM25TF w.r.t
     * @param array $terms we want the term frequency computation for
     * @return array $tfbm25 a 2d array with rows being indexed by terms and
     *     columns indexed by summaries and the values of an entry being
     *     the tfbm25 score for that term in that document
     */
    public static function calculateTFBM25($summaries, $terms)
    {
        $k1 = 1.5;
        $b = 0.75;
        $tfbm25 = [];
        $num_summaries = count($summaries);
        $num_terms = count($terms);
        $lengths = [];
        for ($j = 0; $j < $num_summaries; $j++) {
            $lengths[$j] = strlen($summaries[$j]);
        }
        $avg_length = ($num_summaries != 0) ?
            array_sum($lengths) / $num_summaries : 0;
        // avoids a division by zero when all summaries are empty
        $avg_length = max($avg_length, 1);
        $tf_values = self::calculateTermFreq($summaries, $terms);
        for ($i = 0; $i < $num_terms; $i++) {
            for ($j = 0; $j < $num_summaries; $j++) {
                $frequency = $tf_values[$i][$j];
                /* length normalization uses the length of summary j
                   relative to the average summary length
                 */
                $tfbm25[$i][$j] =
                    ($frequency * ($k1 + 1)) / ($frequency + $k1 *
                    ((1 - $b) + $b * ($lengths[$j] / $avg_length)));
            }
        }
        return $tfbm25;
    }
    /**
     * Computes a 2D array of the number of occurrences of term i in document j
     *
     * @param array $summaries documents to compute frequencies in
     * @param array $terms terms to compute frequencies for
     * @return array 2D array as described above; an empty term has
     *     frequency 0 in every document
     */
    public static function calculateTermFreq($summaries, $terms)
    {
        $tf_values = [];
        $num_terms = count($terms);
        $num_summaries = count($summaries);
        for ($i = 0; $i < $num_terms; $i++) {
            for ($j = 0; $j < $num_summaries; $j++) {
                $tf_values[$i][$j] = ($terms[$i] != "") ?
                    substr_count($summaries[$j], $terms[$i]) : 0;
            }
        }
        return $tf_values;
    }
    /**
     * To get the inverse document frequencies for a collection of terms in
     * a set of documents.
     * IDF(term_i) = log_10(# of document / # docs term i in)
     *
     * @param array $summaries documents to use in calculating IDF score
     * @param array $terms terms to compute IDF score for
     * @return array $idf 1D-array saying the inverse document frequency for
     *     each term; 0 for a term that is empty or occurs in no document
     */
    public static function calculateIDF($summaries, $terms)
    {
        $num_docs = count($summaries);
        $idf = [];
        $num_terms = count($terms);
        for ($i = 0; $i < $num_terms; $i++) {
            $docs_with_term = 0;
            // an empty term is skipped as in calculateTermFreq
            if ($terms[$i] != "") {
                foreach ($summaries as $summary) {
                    if (stripos($summary, $terms[$i]) !== false) {
                        $docs_with_term++;
                    }
                }
            }
            $idf[$i] = ($docs_with_term != 0) ?
                log10($num_docs / $docs_with_term) : 0;
        }
        return $idf;
    }
}
|
180
src/library/Trie.php
Normal file
180
src/library/Trie.php
Normal file
|
@ -0,0 +1,180 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Sandhya Vissapragada, Chris Pollett (separated out this
|
||||
* code into a separate file and cleaned up)
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Implements a trie data structure which can be used to store terms read
 * from a dictionary in a succinct way
 *
 * @author Sandhya Vissapragada, Chris Pollett (rewrite +
 *     documentation, multi-byte support)
 */
class Trie
{
    /**
     * A nested array used to represent the trie. Keys are rawurlencode'd
     * single characters; the key $end_marker marks the end of a term
     * @var array
     */
    public $trie_array;
    /**
     * The marker used to represent the end of an entry in a trie
     * @var string
     */
    public $end_marker;
    /**
     * Creates an empty trie. Sets the end of term character
     *
     * @param string $end_marker end of term marker
     */
    public function __construct($end_marker = " ")
    {
        $this->trie_array = [];
        $this->end_marker = $end_marker;
    }
    /**
     * Adds a term to the Trie. The input is split on spaces: the first
     * piece is the term that gets inserted, the second piece (if any) is
     * stored as the value under the term's end marker.
     *
     * @param string $term the term to be inserted
     * @return array $trie_array beneath last letter of term inserted
     */
    public function add($term)
    {
        $trie_array = & $this->trie_array;
        $term_arr = explode(" ", $term);
        $word = $term_arr[0];
        $value = isset($term_arr[1]) ? $term_arr[1] : null;
        $len = mb_strlen($word, "utf-8");
        for ($i = 0; $i < $len; $i++) {
            $enc_char = rawurlencode(mb_substr($word, $i, 1, "utf-8"));
            // To avoid encoding the linefeed
            if ($enc_char === "%0A") {
                continue;
            }
            // If letter doesn't exist then create one by
            // assigning new array
            if (!isset($trie_array[$enc_char])) {
                $trie_array[$enc_char] = [];
            }
            $trie_array = & $trie_array[$enc_char];
        }
        // Set end of term marker
        $trie_array[$this->end_marker] = $value;
        return $trie_array;
    }
    /**
     * Returns the sub trie_array under $term in
     * $this->trie_array. If $term does not exist in $this->trie_array
     * returns false
     *
     * @param string $term term to look up
     * @return mixed $trie_array subtrie under term, or false if there is
     *     no such subtrie
     */
    public function exists($term)
    {
        // plain copy (copy-on-write), the trie is only read here
        $node = $this->trie_array;
        $len = mb_strlen($term, "utf-8");
        for ($i = 0; $i < $len; $i++) {
            $enc_char = rawurlencode(mb_substr($term, $i, 1, "utf-8"));
            if (!is_array($node) || !isset($node[$enc_char])) {
                return false;
            }
            $node = $node[$enc_char];
        }
        return $node;
    }
    /**
     * Returns all the terms in the trie beneath the provided term prefix
     *
     * @param string $prefix of term to look up
     * @param int $max_results maximum number of strings to return
     * @return mixed array of $terms under $prefix, or false if nothing is
     *     stored under $prefix
     */
    public function getValues($prefix, $max_results)
    {
        $trie_array = $this->exists($prefix);
        if (!$trie_array) {
            return false;
        }
        return $this->getValuesTrieArray($trie_array, $prefix, $max_results);
    }
    /**
     * Collects the terms stored in $trie_array, prepending $prefix to
     * each, until $max_results terms have been found in total.
     *
     * @param array $trie_array a nested array representing a trie to look
     *     up suffixes in
     * @param string $prefix to prepend to each found suffix
     * @param int $max_results maximum number of strings to return
     * @param int $count number of terms found so far (shared by reference
     *     between the recursive calls)
     * @param bool $find_more whether to keep looking or not (stops
     *     recursion, shared by reference between the recursive calls)
     * @return array $terms a list of ($prefix.suffix1, $prefix.suffix2,...)
     */
    private function getValuesTrieArray($trie_array, $prefix, $max_results,
        &$count = 0, &$find_more = true)
    {
        $terms = [];
        if (!is_array($trie_array) || !$find_more) {
            return $terms;
        }
        $end_marker = (string) $this->end_marker;
        foreach ($trie_array as $character => $subtrie) {
            if ($count >= $max_results) {
                $find_more = false;
            }
            if (!$find_more) {
                break;
            }
            /* PHP turns keys like "0" into integers, so compare as
               strings; a loose comparison of 0 with the end marker can
               be true on older PHP versions
             */
            $character = (string) $character;
            if ($character !== $end_marker) {
                $new_terms = $this->getValuesTrieArray($subtrie,
                    $prefix . rawurldecode($character),
                    $max_results, $count, $find_more);
                $terms = array_merge($terms, $new_terms);
            } else {
                $terms[] = $prefix;
                $count++;
            }
        }
        return $terms;
    }
}
|
169
src/library/UnitTest.php
Normal file
169
src/library/UnitTest.php
Normal file
|
@ -0,0 +1,169 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
/**
 * Base class for all the SeekQuarry/Yioop engine Unit tests
 *
 * @author Chris Pollett
 */
abstract class UnitTest
{
    /**
     * Used to store the results for each test sub case
     * @var array
     */
    public $test_case_results;
    /**
     * Used to hold objects to be used in tests
     * @var array
     */
    public $test_objects;
    /**
     * The suffix that all TestCase methods need to have to be called by run()
     */
    const case_name = "TestCase";
    /**
     * Constructor should be overridden to do any set up that occurs before
     * any test cases
     */
    public function __construct()
    {
    }
    /**
     * Execute each of the test cases of this unit test and return the results.
     * A test case is any method whose name ends with self::case_name.
     * setUp() is called before and tearDown() after each test case (even
     * if the test case throws); other methods of the class are left alone.
     *
     * @return array test case results, method name => list of sub case
     *     results
     */
    public function run()
    {
        $test_results = [];
        $suffix_len = strlen(self::case_name);
        foreach (get_class_methods($this) as $method) {
            if (strlen($method) < $suffix_len ||
                substr($method, -$suffix_len) !== self::case_name) {
                continue;
            }
            $this->test_objects = null;
            $this->setUp();
            try {
                // results are reset after setUp and read before tearDown
                $this->test_case_results = [];
                $this->$method();
                $test_results[$method] = $this->test_case_results;
            } finally {
                $this->tearDown();
            }
        }
        return $test_results;
    }
    /**
     * Checks that $x can be coerced to true, the result of the
     * test is added to $this->test_case_results
     *
     * @param mixed $x item to check
     * @param string $description information about this test subcase
     */
    public function assertTrue($x, $description = "")
    {
        $this->addTestResult("assertTrue", ($x) ? true : false, $description);
    }
    /**
     * Checks that $x can be coerced to false, the result of the
     * test is added to $this->test_case_results
     *
     * @param mixed $x item to check
     * @param string $description information about this test subcase
     */
    public function assertFalse($x, $description = "")
    {
        $this->addTestResult("assertFalse", (!$x) ? true : false,
            $description);
    }
    /**
     * Checks that $x and $y are the same (using ==), the result of the
     * test is added to $this->test_case_results
     *
     * @param mixed $x a first item to compare
     * @param mixed $y a second item to compare
     * @param string $description information about this test subcase
     */
    public function assertEqual($x, $y, $description = "")
    {
        $this->addTestResult("assertEqual", $x == $y, $description);
    }
    /**
     * Checks that $x and $y are not the same (using !=), the result of the
     * test is added to $this->test_case_results
     *
     * @param mixed $x a first item to compare
     * @param mixed $y a second item to compare
     * @param string $description information about this test subcase
     */
    public function assertNotEqual($x, $y, $description = "")
    {
        $this->addTestResult("assertNotEqual", $x != $y, $description);
    }
    /**
     * Appends the outcome of one assertion to $this->test_case_results.
     * The sub case number used in the name is the number of results
     * recorded so far.
     *
     * @param string $assertion name of the assert method that was called
     * @param bool $passed whether the assertion held
     * @param string $description information about this test subcase
     */
    private function addTestResult($assertion, $passed, $description)
    {
        $sub_case_num = count($this->test_case_results);
        $this->test_case_results[] = [
            'NAME' => "Case Test $sub_case_num $assertion $description",
            'PASS' => $passed
        ];
    }
    /**
     * This method is called before each test case is run to set up the
     * given test case
     */
    abstract function setUp();
    /**
     * This method is called after each test case is run to clean up
     */
    abstract function tearDown();
}
|
294
src/library/UpgradeFunctions.php
Normal file
294
src/library/UpgradeFunctions.php
Normal file
|
@ -0,0 +1,294 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* This file contains global functions to check whether
|
||||
* upgrading the database or locales is needed as well as auxiliary functions
|
||||
* to be used by the VersionFunctions.php code to actually carry out
|
||||
* upgrades between versions
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
use seekquarry\yioop\models as M;
|
||||
use seekquarry\yioop\models\datasources as D;
|
||||
|
||||
/** For Yioop global defines */
|
||||
require_once __DIR__."/../configs/Config.php";
|
||||
/**
 * Checks to see if the locale data of Yioop! of a locale in the work dir is
 * older than the currently running Yioop!
 *
 * The check compares the modification times of the locale's configure.ini
 * under C\FALLBACK_LOCALE_DIR and under C\LOCALE_DIR.
 *
 * @param string $locale_tag locale to check directory of
 * @return mixed the string "locale" if the fallback configure.ini is newer
 *     than the one in the work directory (or the work directory has none);
 *     false otherwise, including when C\PROFILE is not set
 */
function upgradeLocalesCheck($locale_tag)
{
    if (!C\PROFILE) {
        return false;
    }
    $dir_locale_tag = str_replace("-", "_", $locale_tag);
    $config_name = C\LOCALE_DIR . "/$dir_locale_tag/configure.ini";
    $fallback_config_name =
        C\FALLBACK_LOCALE_DIR . "/$dir_locale_tag/configure.ini";
    // filemtime() warns on a missing file, so existence is checked first
    if (!file_exists($fallback_config_name)) {
        return false;
    }
    if (!file_exists($config_name)) {
        return "locale";
    }
    if (filemtime($fallback_config_name) > filemtime($config_name)) {
        return "locale";
    }
    return false;
}
|
||||
/**
 * If the locale data of Yioop! in the work directory is older than the
 * currently running Yioop! then this function is called to at least
 * try to copy the new strings into the old profile.
 *
 * Does nothing unless C\PROFILE is set.
 */
function upgradeLocales()
{
    if (!C\PROFILE) {
        return;
    }
    $locale = new M\LocaleModel();
    $locale->initialize(C\DEFAULT_LOCALE);
    /*
        When the default locale has no "view_locale_version15" string the
        resources folder is forced to be refreshed and the default
        Public/Help wiki pages are pushed into the database again.
     */
    $version_string_missing =
        empty($locale->configure['strings']["view_locale_version15"]);
    $force_folders = ($version_string_missing) ? ["resources"] : [];
    if ($version_string_missing) {
        upgradePublicHelpWiki($locale->db);
    }
    $locale->extractMergeLocales($force_folders);
}
|
||||
/**
 * Used to force push the default Public and Wiki pages into the current
 * database
 *
 * The pages come from configs/PublicHelpPages.php, which is expected to
 * define $public_pages and $help_pages (locale tag => page name => content).
 * The locale that was active on entry is restored before returning.
 *
 * @param object& $db datasource to use to upgrade
 */
function upgradePublicHelpWiki(&$db)
{
    /** For new wiki pages */
    require_once C\BASE_DIR."/configs/PublicHelpPages.php";
    $group_model = new M\GroupModel(C\DB_NAME, false);
    $group_model->db = $db;
    $default_locale = getLocaleTag();
    /* require_once only defines the page arrays in the scope of the
       first inclusion; on any later call they are not set here, so
       fall back to empty lists rather than iterating undefined variables
     */
    $page_sets = [
        [C\PUBLIC_USER_ID, isset($public_pages) ? $public_pages : []],
        //Insert Default Public Help pages
        [C\HELP_GROUP_ID, isset($help_pages) ? $help_pages : []],
    ];
    try {
        foreach ($page_sets as $page_set) {
            $group_id = $page_set[0];
            foreach ($page_set[1] as $locale_tag => $locale_pages) {
                // tl() below translates into the locale of the page
                setLocaleObject($locale_tag);
                foreach ($locale_pages as $page_name => $page_content) {
                    $group_model->setPageName(C\ROOT_ID, $group_id,
                        $page_name, $page_content, $locale_tag, "",
                        tl('social_component_page_created', $page_name),
                        tl('social_component_page_discuss_here'));
                }
            }
        }
    } finally {
        setLocaleObject($default_locale);
    }
}
|
||||
/**
 * Checks to see if the database data or work_dir folder of Yioop! is from an
 * older version of Yioop! than the currently running Yioop!
 *
 * The version query is tried up to three times, sleeping three seconds
 * after each failure; if it never succeeds the script exits.
 *
 * @return bool false if the stored version is at least C\YIOOP_VERSION,
 *     true otherwise
 */
function upgradeDatabaseWorkDirectoryCheck()
{
    $model = new M\Model();
    $sql = "SELECT ID FROM VERSION";
    $attempts_left = 3;
    while ($attempts_left > 0) {
        $attempts_left--;
        $result = @$model->db->execute($sql);
        if ($result === false) {
            sleep(3);
            continue;
        }
        $row = $model->db->fetchArray($result);
        // the column name may come back in either case
        $up_to_date =
            (isset($row['ID']) && $row['ID'] >= C\YIOOP_VERSION) ||
            (isset($row['id']) && $row['id'] >= C\YIOOP_VERSION);
        return !$up_to_date;
    }
    exit();
}
|
||||
/**
 * If the database data of Yioop is older than the version of the
 * currently running Yioop then this function is called to try
 * upgrade the database to the new version
 *
 * For each version after the stored one, up to C\YIOOP_VERSION, the
 * function upgradeDatabaseVersion<version> in the C\NS_LIB namespace is
 * called if it exists; afterwards the stored version number is updated.
 */
function upgradeDatabaseWorkDirectory()
{
    $model = new M\Model();
    $sql = "SELECT ID FROM VERSION";
    $result = @$model->db->execute($sql);
    if ($result === false) {
        exit(); // maybe someone else has locked DB, so bail
    }
    $row = $model->db->fetchArray($result);
    // no usable version row is treated as version 1
    $current_version = 1;
    foreach (['ID', 'id'] as $column) {
        if (!empty($row[$column])) {
            $current_version = min($row[$column], C\YIOOP_VERSION);
            break;
        }
    }
    $result = null; //don't lock db if sqlite
    $all_versions = range(1, C\YIOOP_VERSION);
    $current_position = array_search($current_version, $all_versions);
    $pending_versions = array_slice($all_versions, $current_position + 1);
    foreach ($pending_versions as $version) {
        $upgrade_db = C\NS_LIB . "upgradeDatabaseVersion$version";
        if (!function_exists($upgrade_db)) {
            continue;
        }
        $upgrade_db($model->db);
    }
    updateVersionNumber($model->db, C\YIOOP_VERSION);
}
|
||||
|
||||
/**
 * Update the database version number to a new number
 *
 * The VERSION table is emptied and a single row with the new number is
 * inserted.
 *
 * @param object $db datasource for Yioop database
 * @param int $number the new database number
 */
function updateVersionNumber(&$db, $number)
{
    $db->execute("DELETE FROM VERSION");
    // bound parameter rather than interpolating the value into the SQL
    $db->execute("INSERT INTO VERSION VALUES (?)", [$number]);
}
|
||||
/**
 * Reads the Help articles from default db and returns the array of pages.
 *
 * Up to 200 "en-US" pages of the group C\HELP_GROUP_ID are read from
 * data/default.db under C\BASE_DIR.
 *
 * @return mixed array of page title => page content with HTML entities
 *     decoded, or false if the database manager is not available
 */
function getWikiHelpPages()
{
    $help_pages = [];
    $default_dbm = new D\Sqlite3Manager();
    $default_dbm->connect("", "", "", C\BASE_DIR . "/data/default.db");
    /* NOTE(review): $default_dbm is an object so this test cannot fail;
       presumably the result of connect() was meant to be checked --
       confirm what connect() returns before changing it
     */
    if (!$default_dbm) {
        return false;
    }
    $group_model = new M\GroupModel(C\DB_NAME, true);
    $group_model->db = $default_dbm;
    $page_list = $group_model->getPageList(
        C\HELP_GROUP_ID, "en-US", '', 0, 200);
    if (empty($page_list[1]) || !is_array($page_list[1])) {
        return $help_pages;
    }
    foreach ($page_list[1] as $page) {
        if (!isset($page['TITLE'])) {
            continue;
        }
        $page_info = $group_model->getPageInfoByName(
            C\HELP_GROUP_ID, $page['TITLE'], "en-US", "api");
        $page_content = isset($page_info['PAGE']) ? $page_info['PAGE'] : "";
        /* replacing "&" by "&" did nothing; presumably this step is
           meant to undo doubly encoded ampersands so that the entity
           decoding below yields the original characters
         */
        $page_content = str_replace("&amp;", "&", $page_content);
        $page_content = html_entity_decode($page_content, ENT_QUOTES,
            "UTF-8");
        $help_pages[$page['TITLE']] = $page_content;
    }
    return $help_pages;
}
|
||||
/**
 * Used to insert a new activity into the database at a given activity_id
 *
 * Inserting at an ID rather than at the end is useful since activities are
 * displayed in admin panel in order of increasing id.
 *
 * @param resource& $db database handle where Yioop database stored
 * @param string $string_id message identifier to give translations for
 *     for activity
 * @param string $method_name admin_controller method to be called to perform
 *     this activity
 * @param int $activity_id the id location at which to create this activity
 *     activity at and below this location will be shifted down by 1.
 */
function addActivityAtId(&$db, $string_id, $method_name, $activity_id)
{
    $shift_params = [$activity_id];
    $db->execute("UPDATE ACTIVITY SET ACTIVITY_ID = ACTIVITY_ID + 1 WHERE " .
        "ACTIVITY_ID >= ?", $shift_params);
    /* NOTE(review): the bulk UPDATE above and the row by row loop below
       both move ids upward; if the UPDATE succeeds the rows look like
       they get shifted twice while ROLE_ACTIVITY is shifted once --
       confirm whether the UPDATE is expected to fail on keyed tables
     */
    $sql = "SELECT * FROM ACTIVITY WHERE ACTIVITY_ID >= ?
        ORDER BY ACTIVITY_ID DESC";
    $activities = $db->execute($sql, $shift_params);
    // highest id first, so the id being written is already vacated
    while ($activity = $db->fetchArray($activities)) {
        $old_id = $activity['ACTIVITY_ID'];
        $db->execute("INSERT INTO ACTIVITY VALUES (?, ?, ?)",
            [($old_id + 1), $activity['TRANSLATION_ID'],
            $activity['METHOD_NAME']]);
        $db->execute("DELETE FROM ACTIVITY WHERE ACTIVITY_ID = ?",
            [$old_id]);
    }
    $skip_role_update =
        in_array($method_name, ["manageAdvertisements", "manageCredits"]);
    if (!$skip_role_update) {
        $db->execute("UPDATE ROLE_ACTIVITY SET ACTIVITY_ID = ACTIVITY_ID + 1 " .
            "WHERE ACTIVITY_ID >= ?", [$activity_id]);
        //give root account permissions on the activity.
        $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, ?)",
            [$activity_id]);
    }
    $count_result = $db->execute("SELECT COUNT(*) AS NUM FROM TRANSLATION");
    $count_row = ($count_result) ? $db->fetchArray($count_result) : false;
    if (!$count_row) {
        echo "Upgrade activity error";
        exit();
    }
    //some search id start at 1000, so +1001 ensures we steer clear of them
    $translation_id = $count_row['NUM'] + 1001;
    $db->execute("INSERT INTO ACTIVITY VALUES (?, ?, ?)",
        [$activity_id, $translation_id, $method_name]);
    $db->execute("INSERT INTO TRANSLATION VALUES (?, ?)",
        [$translation_id, $string_id]);
}
|
||||
/**
 * Adds or replaces a translation for a database message string for a given
 * IANA locale tag.
 *
 * The locale tag and the string id are first looked up in the LOCALE and
 * TRANSLATION tables. If either lookup finds nothing, the function returns
 * without modifying TRANSLATION_LOCALE, so no row with a missing id is
 * written. Otherwise any existing translation for the pair is deleted and
 * the new one inserted.
 *
 * @param resource& $db database handle where Yioop database stored
 * @param string $string_id message identifier to give translation for
 * @param string $locale_tag the IANA language tag to update the strings of
 * @param string $translation the translation for $string_id in the language
 *      $locale_tag
 */
function updateTranslationForStringId(&$db, $string_id, $locale_tag,
    $translation)
{
    $sql = "SELECT LOCALE_ID FROM LOCALE ".
        "WHERE LOCALE_TAG = ? " . $db->limitOffset(1);
    $result = $db->execute($sql, [$locale_tag]);
    $row = ($result) ? $db->fetchArray($result) : false;
    if (!$row || !isset($row['LOCALE_ID'])) {
        // unknown locale: nothing sensible to attach the translation to
        return;
    }
    $locale_id = $row['LOCALE_ID'];

    $sql = "SELECT TRANSLATION_ID FROM TRANSLATION ".
        "WHERE IDENTIFIER_STRING = ? " . $db->limitOffset(1);
    $result = $db->execute($sql, [$string_id]);
    $row = ($result) ? $db->fetchArray($result) : false;
    if (!$row || !isset($row['TRANSLATION_ID'])) {
        // unknown message identifier: same reasoning as above
        return;
    }
    $translate_id = $row['TRANSLATION_ID'];
    // delete-then-insert so the call works whether or not a translation
    // already exists for this (string, locale) pair
    $sql = "DELETE FROM TRANSLATION_LOCALE ".
        "WHERE TRANSLATION_ID =? AND ".
        "LOCALE_ID = ?";
    $db->execute($sql, [$translate_id, $locale_id]);
    $sql = "INSERT INTO TRANSLATION_LOCALE VALUES (?, ?, ?)";
    $db->execute($sql, [$translate_id, $locale_id, $translation]);
}
|
1045
src/library/UrlParser.php
Normal file
1045
src/library/UrlParser.php
Normal file
File diff suppressed because it is too large
Load diff
2464
src/library/Utility.php
Normal file
2464
src/library/Utility.php
Normal file
File diff suppressed because it is too large
Load diff
1350
src/library/VersionFunctions.php
Normal file
1350
src/library/VersionFunctions.php
Normal file
File diff suppressed because it is too large
Load diff
393
src/library/WebArchive.php
Normal file
393
src/library/WebArchive.php
Normal file
|
@ -0,0 +1,393 @@
|
|||
<?php
|
||||
/**
|
||||
* SeekQuarry/Yioop --
|
||||
* Open Source Pure PHP Search Engine, Crawler, and Indexer
|
||||
*
|
||||
* Copyright (C) 2009 - 2017 Chris Pollett chris@pollett.org
|
||||
*
|
||||
* LICENSE:
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* END LICENSE
|
||||
*
|
||||
* @author Chris Pollett chris@pollett.org
|
||||
* @license http://www.gnu.org/licenses/ GPL3
|
||||
* @link http://www.seekquarry.com/
|
||||
* @copyright 2009 - 2017
|
||||
* @filesource
|
||||
*/
|
||||
namespace seekquarry\yioop\library;
|
||||
|
||||
use seekquarry\yioop\configs as C;
|
||||
|
||||
/**
|
||||
* Loads crawlLog functions if needed
|
||||
*/
|
||||
require_once __DIR__."/Utility.php";
|
||||
/**
 * Code used to manage web archive files
 *
 * An archive is a sequence of records, each written as a compressed length
 * header followed by a compressed, serialized object. A file backed archive
 * ends with an info block (see writeInfoBlock()); a string backed archive
 * keeps its records in $storage and has no info block.
 *
 * @author Chris Pollett
 */
class WebArchive
{
    /**
     * Filename used to store the web archive.
     * @var string
     */
    public $filename;
    /**
     * Current offset into the web archive the iterator for the archive is at
     * (at most one iterator / archive -- oh well)
     * @var int
     */
    public $iterator_pos;
    /**
     * Filter object used to compress/uncompress objects stored in archive.
     * The methods compress, uncompress, compressInt, uncompressInt and
     * compressedIntLen are called on it.
     * @var object
     */
    public $compressor;
    /**
     * Number of items in the archive
     * @var int
     */
    public $count;
    /**
     * Version number of the current archive
     * @var float
     */
    public $version;
    /**
     * Says whether the archive is a string archive
     * @var bool
     */
    public $is_string;
    /**
     * If archive is stored as a string rather than persistently to disk
     * then $storage is used to hold the string
     * @var string
     */
    public $storage;
    /**
     * Version number to use in the WebArchive header if constructing a new
     * archive
     */
    const WEB_ARCHIVE_VERSION = 1.0;
    /**
     * Makes or initializes a WebArchive object using the supplied parameters
     *
     * For a file archive whose file does not exist yet, the file is created
     * and an initial info block is written to it.
     *
     * @param string $fname filename to use to store archive to disk
     * @param string $compressor what kind of Compressor object should be
     *      used to read and write objects in the archive
     * @param bool $fast_construct if true, the info block of an existing
     *      archive file is not read as part of the constructing process
     * @param bool $is_string says whether the archive stores to string
     *      rather than a file
     */
    public function __construct($fname, $compressor, $fast_construct = false,
        $is_string = false)
    {
        $this->filename = $fname;
        $this->compressor = $compressor;
        $this->is_string = $is_string;
        $this->iterator_pos = 0;
        if ($this->is_string) {
            $this->storage = "";
            $this->count = 0;
            $this->version = self::WEB_ARCHIVE_VERSION;
            return;
        }
        if (file_exists($fname)) {
            if (!$fast_construct) {
                $this->readInfoBlock();
            }
            return;
        }
        $this->count = 0;
        // a brand new archive gets the version writeInfoBlock() records
        $this->version = self::WEB_ARCHIVE_VERSION;
        $fh = fopen($this->filename, "w");
        $this->writeInfoBlock($fh);
        fclose($fh);
    }
    /**
     * Read the info block associated with this web archive.
     * The info block is meta data for the archive stored at the end of
     * the WebArchive file. The particular meta is up to who is using
     * the web archive. Reading it also sets $count and $version.
     *
     * @return array the contents of the info block's data field, or null
     *      if there is none or this is a string archive
     */
    public function readInfoBlock()
    {
        if ($this->is_string) {
            return null;
        }
        $fh = fopen($this->filename, "r");
        // leaves $fh at the start of the info block
        $len = $this->seekEndObjects($fh);
        $info_string = fread($fh, $len);
        fclose($fh);
        $info_block = unserialize($this->compressor->uncompress($info_string));
        $this->count = $info_block["count"];
        $this->version = $info_block["version"];
        if (isset($info_block["data"])) {
            return $info_block["data"];
        }
        return null;
    }
    /**
     * Serializes and applies the compressor to an info block and writes it
     * at the end of the web archive.
     * The info block is meta data for the archive stored at the end of
     * the WebArchive file. The particular meta is up to who is using
     * the web archive; however, count and archive version number are always
     * stored. The block is followed by a compressed int giving the combined
     * length of the block and that int. Does nothing for string archives.
     *
     * @param resource $fh resource for the web archive file, positioned
     *      where the info block should start. If null the web archive is
     *      opened first, positioned at its current info block, and closed
     *      when the data is written
     * @param array& $data data to write into the info block of the archive
     */
    public function writeInfoBlock($fh = null, &$data = null)
    {
        if ($this->is_string) {
            return;
        }
        $compressed_int_len = $this->compressor->compressedIntLen();
        $open_flag = false;
        if ($fh == null) {
            $open_flag = true;
            $fh = fopen($this->filename, "r+");
            $this->seekEndObjects($fh);
        }
        $info_block = [];
        $info_block["version"] = self::WEB_ARCHIVE_VERSION;
        $info_block["count"] = $this->count;
        if ($data != null) {
            $info_block['data'] = & $data;
        }
        $info_string =
            $this->compressor->compress(serialize($info_block));
        $len = strlen($info_string) + $compressed_int_len;

        // drop any old info block that follows the current position
        $offset = ftell($fh);
        ftruncate($fh, $offset);

        $out = $info_string . $this->compressor->compressInt($len);
        fwrite($fh, $out, $len);

        if ($open_flag) {
            fclose($fh);
        }
    }
    /**
     * Seeks in the WebArchive file to the end of the last Object.
     *
     * The last $compressed_int_len bytes of a WebArchive say the length
     * of an info block in bytes (including those last bytes), so seeking
     * back that far from the end of the file gives the position just after
     * the last object. For a string archive $fh is not used and the length
     * of $storage is returned.
     *
     * @param resource $fh resource for the WebArchive file
     * @return int length of the info block not counting its trailing
     *      length int
     */
    public function seekEndObjects($fh)
    {
        if ($this->is_string) {
            return strlen($this->storage);
        }
        $compressed_int_len = $this->compressor->compressedIntLen();
        fseek($fh, - $compressed_int_len, SEEK_END);
        $len_block = $this->compressor->uncompressInt(
            fread($fh, $compressed_int_len));
        fseek($fh, - ($len_block), SEEK_END);
        return $len_block - $compressed_int_len;
    }
    /**
     * Adds objects to the WebArchive
     *
     * @param string $offset_field field in objects to return the byte offset
     *      at which they were stored
     * @param array& $objects references to objects that will be stored;
     *      the offset field in these references will be adjusted if
     *      $return_flag is false
     * @param array $data data to write in the WebArchive's info block
     * @param string $callback name of a callback
     *      $callback($data, $new_objects, $offset_field)
     *      used to modify $data before it is written
     *      to the info block. For instance, we can add offset info to data.
     * @param bool $return_flag if true rather than adjust the offsets by
     *      reference, create copy objects and adjust their offsets and
     *      return them
     * @return mixed adjusted objects or void
     */
    public function addObjects($offset_field, &$objects,
        $data = null, $callback = null, $return_flag = true)
    {
        $is_string = $this->is_string;
        if (!$is_string) {
            $fh = fopen($this->filename, "r+");
            // position after the last object and cut off the old info block
            $this->seekEndObjects($fh);
            $offset = ftell($fh);
            ftruncate($fh, $offset);
        } else {
            $offset = strlen($this->storage);
        }
        $out = "";
        if ($return_flag) {
            $new_objects = $objects;
        } else {
            $new_objects = & $objects;
        }
        $num_objects = count($new_objects);
        $compressed_int_len = $this->compressor->compressedIntLen();
        // walk the keys actually present so arrays that are not indexed
        // 0, 1, 2, ... are stored correctly too
        foreach (array_keys($new_objects) as $key) {
            $new_objects[$key][$offset_field] = $offset;
            $file = serialize($new_objects[$key]);
            $compressed_file = $this->compressor->compress($file);
            $len = strlen($compressed_file);
            $out .= $this->compressor->compressInt($len) . $compressed_file;
            $offset += $len + $compressed_int_len;
        }
        $this->count += $num_objects;
        if ($is_string) {
            $this->storage .= $out;
        } else {
            fwrite($fh, $out, strlen($out));
        }
        if ($data != null && $callback != null) {
            $data = $callback($data, $new_objects, $offset_field);
        }
        if (!$is_string) {
            $this->writeInfoBlock($fh, $data);
            fclose($fh);
        }
        if ($return_flag) {
            return $new_objects;
        }
        return;
    }
    /**
     * Open the web archive file associated with this WebArchive object.
     *
     * @param string $mode read/write mode to open file with
     * @return resource a file resource for the web archive; for a string
     *      archive the placeholder string "is_string" is returned instead
     */
    public function open($mode = "r")
    {
        if ($this->is_string) {
            return "is_string";
        }
        $fh = fopen($this->filename, $mode);
        return $fh;
    }
    /**
     * Closes a file handle (which should be of a web archive). Does nothing
     * for a string archive.
     *
     * @param resource $fh filehandle to close
     */
    public function close($fh)
    {
        if ($this->is_string) {
            return;
        }
        fclose($fh);
    }
    /**
     * Gets $num many objects out of the web archive starting at byte $offset
     *
     * If the $next_flag is true the archive iterator is advanced and if $fh
     * is not null then it is assumed to be an open resource pointing to the
     * archive (saving the time to open it). When a length header is not a
     * usable length, a message is logged and the bytes read for it are
     * skipped.
     *
     * @param int $offset a valid byte offset into a web archive
     * @param int $num number of objects to return
     * @param bool $next_flag whether to advance the archive iterator
     * @param resource $fh either null or a file resource to the archive
     * @return array pairs [offset just past the object, object] for up to
     *      $num objects beginning at $offset
     */
    public function getObjects($offset, $num, $next_flag = true, $fh = null)
    {
        $open_flag = false;
        if ($fh == null) {
            $fh = $this->open();
            $open_flag = true;
        }
        $is_string = $this->is_string;
        $objects = [];
        $compressed_int_len = $this->compressor->compressedIntLen();
        if ($is_string) {
            $storage_len = strlen($this->storage);
        }
        if ((!$is_string && fseek($fh, $offset) == 0) || ($is_string
            && $offset < $storage_len)) {
            for ($i = 0; $i < $num; $i++) {
                if (!$is_string && feof($fh)) {
                    break;
                }
                if ($is_string && $offset >= $storage_len) {
                    break;
                }
                $object = null;
                $compressed_len = ($is_string)
                    ? substr($this->storage, $offset, $compressed_int_len)
                    : fread($fh, $compressed_int_len);
                $len = $this->compressor->uncompressInt($compressed_len);
                if ($len > 0 && $len < C\MAX_ARCHIVE_OBJECT_SIZE) {
                    $compressed_file = ($is_string)
                        ? substr($this->storage, $offset + $compressed_int_len,
                            $len)
                        : fread($fh, $len);
                    // go back to the previously installed error handler
                    // while unserializing, then reinstall Yioop's
                    restore_error_handler();
                    $file = $this->compressor->uncompress($compressed_file);
                    $object = @unserialize($file);
                    set_error_handler(C\NS_LIB . "yioop_error_handler");
                    $offset += $compressed_int_len + $len;
                    $objects[] = [$offset, $object];
                } else {
                    crawlLog("Web archive saw blank line ".
                        "when looked for offset $offset");
                    // keep $offset in step with what was consumed: without
                    // this a string archive re-reads the same bytes and a
                    // file archive reports offsets behind the file position
                    $offset += strlen((string)$compressed_len);
                }
            }
            if ($next_flag) {
                $this->iterator_pos = $offset;
            }
        }
        if ($open_flag) {
            $this->close($fh);
        }
        return $objects;
    }
    /**
     * Returns $num many objects from the web archive starting at the current
     * iterator position, leaving the iterator position unchanged
     *
     * @param int $num number of objects to return
     * @return array an array of objects from the web archive
     */
    public function currentObjects($num)
    {
        return $this->getObjects($this->iterator_pos, $num, false);
    }
    /**
     * Returns $num many objects from the web archive starting at the
     * current iterator position. The iterator is advanced to the object
     * after the last one returned
     *
     * @param int $num number of objects to return
     * @return array an array of objects from the web archive
     */
    public function nextObjects($num)
    {
        return $this->getObjects($this->iterator_pos, $num);
    }
    /**
     * Resets the iterator for this web archive to the first object
     * in the archive
     */
    public function reset()
    {
        $this->iterator_pos = 0;
    }
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue