init:yioop-4.0.1

2018-04-16 20:30:08 +08:00 · 2018-04-16 20:30:08 +08:00 · 3b97f4ec02
commit 3b97f4ec02
519 changed files with 186218 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+local_config.php
+LocalConfig.php
+.DS_Store
+/vendor/
+/work_directory/
--- a/.htaccess
+++ b/.htaccess
@ -0,0 +1,12 @@
+Options +FollowSymLinks
+RewriteEngine On
+
+RewriteRule "^wd/(css|scripts|locale)/(.*)$" work_directory/app/$1/$2 [L]
+RewriteRule "^wd/resources/(.*)/(.*)/(.*)/(.*)/(.*)$" index.php?c=resource&a=get&f=resources&$1&g=$2&p=$3&sf=$4&n=$5 [L]
+RewriteRule "^wd/resources/(.*)/(.*)/(.*)/(.*)$" index.php?c=resource&a=get&f=resources&$1&g=$2&p=$3&n=$4 [L]
+
+RewriteRule "^(css|resources|scripts|locale)/(.*)$" src/$1/$2 [L]
+RewriteRule "^((.*)bar.xml|favicon.ico|robots.txt)$" src/$1 [L]
+
+RewriteCond %{REQUEST_FILENAME} !index.php|tests/.*$|(src|work_directory/app)/(favicon.ico$|robots.txt$|yioopbar.xml$|(css|scripts|resources/.*$|locale/.*$))
+RewriteRule ^ index.php [L]
--- a/51
+++ b/51
@ -0,0 +1,51 @@
+SeekQuarry/Yioop --
+Open Source Pure PHP Search Engine, Crawler, and Indexer
+
+Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+
+http://www.seekquarry.com/
+
+LICENSE:
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+What follows is a brief summary of how to install Yioop!
+More details about installation and configuration (including screenshots)
+can be found at:
+http://www.seekquarry.com/?c=main&p=documentation
+
+Installation
+-------------
+(1) Move the Yioop search engine into some folder under your
+web server's document root.
+
+(2) Default admin account: root, with an empty password (管理账号：root 密码为空)
+
+(3)
+
+(4) To start a crawl, you need to point your browser at the url
+of your Yioop installation. Click on the admin link,
+then the manage crawl link. Type in a description of your
+crawl and click "Start New Crawl". After about a minute you should
+see the Currently Processing and Most Recent Urls sections start updating
+with info about the current crawl. 
+
+(5) After running your crawl for a while you can click the Stop
+button to stop it. The crawl should show up after a delay in the
+Previous Crawls table. There you can choose to resume a crawl,
+delete it, or set it as the current index.
+
+(6) If you set a crawl as the current index, when you go
+back to your installation's web page and type a query in the search bar,
+the query will be answered with the results from that crawl.
--- a/683
+++ b/683
@ -0,0 +1,683 @@
+SeekQuarry/Yioop --
+Open Source Pure PHP Search Engine, Crawler, and Indexer
+
+Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+
+SeekQuarry is distributed under the terms of the GNU GENERAL PUBLIC
+LICENSE, version 3, reproduced below.
+
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+
--- a/72
+++ b/72
@ -0,0 +1,72 @@
+SeekQuarry/Yioop --
+Open Source Pure PHP Search Engine, Crawler, and Indexer
+
+Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+
+http://www.seekquarry.com/
+
+LICENSE:
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+Summary
+-------
+The Yioop search engine consists of three main
+scripts: 
+
+src/executables/Fetcher.php - used to download batches of urls provided
+    by the queue_server.
+src/executables/queue_server.php - maintains a queue of urls that are
+    going to be scheduled to be seen. It also keeps
+    track of what has been seen and robots.txt info.
+    Its last responsibility is to create the index_archive
+    that is used by the search front end.
+
+index.php -- a search engine web page. It is also used
+    to handle message passing between the fetchers
+    (multiple machines can act as fetchers) and the
+    queue_server.
+
+Download
+--------
+You can download the SeekQuarry search engine from
+http://www.seekquarry.com/
+
+Requirements
+------------
+The Yioop search engine requires Apache and 
+PHP. It was developed under Apache 2.2, PHP 5.4,
+and the sqlite3 built into PHP.
+
+Credits
+------
+The source code is mainly due to Chris Pollett.
+Other contributors include: Mangesh Dahale, Ravi Dhillon, Priya Gangaraju,
+Akshat Kukreti, Pooja Mishra, Sreenidhi Pundi Muralidharan,
+Nakul Natu, Shailesh Padave, Vijaya Pamidi, Snigdha Parvatneni,
+Akash Patel, Vijeth Patil, Mallika Perepa, Tarun Pepira,
+Eswara Rajesh Pinapala, Tamayee Potluri, Shawn Tice, Pushkar Umaranikar,
+Sandhya Vissapragada. Several people helped with localization: 
+My wife, Mary Pollett, Jonathan Ben-David, Ismail.B, Andrea Brunetti,
+Thanh Bui, Sujata Dongre, Animesh Dutta, Aida Khosroshahi, Radha Kotipalli,
+Youn Kim, Akshat Kukreti, Chao-Hsin Shih, Ahmed Kamel Taha, and Sugi Widjaja.
+
+Installation
+-------------
+Please see the INSTALL file
+
+Documentation and Support
+-------------------------
+Please check out seekquarry.com
--- a/composer.json
+++ b/composer.json
@ -0,0 +1,34 @@
+{
+    "name": "seekquarry/yioop",
+    "description": "Search Engine Portal with Front-end, Indexer, Crawler, Wiki, and NLP libraries",
+    "homepage": "https://www.seekquarry.com/",
+    "license": "GPLv3",
+    "authors": [
+        {
+            "name": "Chris Pollett",
+            "email": "chris@pollett.org"
+        }
+    ],
+    "minimum-stability": "stable",
+    "require": {
+        "php": ">=5.4.0",
+        "ext-dom": "*",
+        "ext-gd": "*",
+        "ext-json": "*",
+        "ext-mbstring": "*",
+        "ext-pcre": "*",
+        "ext-PDO":"*",
+        "ext-pdo_sqlite": "*",
+        "ext-SPL":"*",
+        "ext-zip": "*",
+        "lib-curl": "*",
+        "lib-libxml": "*",
+        "lib-pcre": "*"
+    },
+    "autoload": {
+        "psr-4": {
+            "seekquarry\\yioop\\": ["work_directory/app/", "src/"],
+            "seekquarry\\yioop\\tests\\": ["tests/"]
+        }
+    }
+}
--- a/composer.lock
+++ b/composer.lock
@ -0,0 +1,32 @@
+{
+    "_readme": [
+        "This file locks the dependencies of your project to a known state",
+        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
+        "This file is @generated automatically"
+    ],
+    "hash": "a9a653eb7781a772e1ab492dd4eee5e8",
+    "content-hash": "9a00ca22dcc4a39f7ecfbdf2444131f2",
+    "packages": [],
+    "packages-dev": [],
+    "aliases": [],
+    "minimum-stability": "stable",
+    "stability-flags": [],
+    "prefer-stable": false,
+    "prefer-lowest": false,
+    "platform": {
+        "php": ">=5.4.0",
+        "ext-dom": "*",
+        "ext-gd": "*",
+        "ext-json": "*",
+        "ext-mbstring": "*",
+        "ext-pcre": "*",
+        "ext-pdo": "*",
+        "ext-pdo_sqlite": "*",
+        "ext-spl": "*",
+        "ext-zip": "*",
+        "lib-curl": "*",
+        "lib-libxml": "*",
+        "lib-pcre": "*"
+    },
+    "platform-dev": []
+}
--- a/index.php
+++ b/index.php
@ -0,0 +1,48 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+/**
+ * Sends a request from the outer Yioop index.php file on to the inner one.
+ *
+ * Before doing so, runs of "//" and "/./" in the path of the requested url
+ * are collapsed to a single "/". If this changes the path, a permanent (301)
+ * redirect to the cleaned url is sent and the script exits. Only the path is
+ * cleaned: everything from the first "?" on is passed along untouched so
+ * that query values which legitimately contain "//" (for example, a url
+ * used as a search term) are not corrupted by the redirect.
+ *
+ * If no redirect is needed, marks that redirects are on and loads
+ * src/index.php.
+ */
+function passthruYioopRequest()
+{
+    $uri = $_SERVER['REQUEST_URI'];
+    $query_start = strpos($uri, "?");
+    if ($query_start === false) {
+        $path = $uri;
+        $query = "";
+    } else {
+        $path = substr($uri, 0, $query_start);
+        $query = substr($uri, $query_start);
+    }
+    $new_path = preg_replace("@/(\.?/)+@", "/", $path);
+    // preg_replace returns null on a pcre error; don't redirect in that case
+    if ($new_path !== null && $new_path !== $path) {
+        header("Location: " . $new_path . $query, true, 301);
+        exit();
+    }
+    define("seekquarry\\yioop\\configs\\REDIRECTS_ON", true);
+    require_once __DIR__."/src/index.php";
+}
+passthruYioopRequest();
--- a/src/advertise.php
+++ b/src/advertise.php
@ -0,0 +1,44 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Web page used to display the static "advertise" page of
+ * the SeekQuarry/Yioop Search engine
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+/* set-up static advertise page for display */
+$_REQUEST['c'] = "static";
+$_REQUEST['p'] = "advertise";
+/*
+   bootstrap() is called explicitly below, so define SKIP_BOOTSTRAP before
+   loading the main entry point, exactly as the sibling entry points
+   blog.php and bot.php do. NOTE(review): presumably this constant stops
+   index.php from bootstrapping on its own when it is required -- confirm
+   against src/index.php
+ */
+define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
+/**
+ * load in main entry point
+ */
+require_once(__DIR__."/index.php");
+bootstrap();
+exit();
--- a/src/blog.php
+++ b/src/blog.php
@ -0,0 +1,42 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Web page used to display a simple blog about current crawls related to
+ * the SeekQuarry/Yioop Search engine
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+/* preset the request parameters (c, a, and just_group_id) that the main
+   entry point will see for the blog page */
+$_REQUEST['c'] = 'group';
+$_REQUEST['a'] = 'groupFeeds';
+$_REQUEST['just_group_id'] = 2;
+/* SKIP_BOOTSTRAP is defined before the main entry point is loaded;
+   bootstrap() is then called explicitly and the script ends */
+define("seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP", true);
+require_once __DIR__ . '/index.php';
+bootstrap();
+exit();
--- a/src/bot.php
+++ b/src/bot.php
@ -0,0 +1,41 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Web page used to display information about the web robot used in
+ * the SeekQuarry/Yioop Search engine
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+/* preset the request parameters (c and p) that the main entry point will
+   see for the static bot page */
+$_REQUEST['c'] = 'static';
+$_REQUEST['p'] = 'bot';
+/* SKIP_BOOTSTRAP is defined before the main entry point is loaded;
+   bootstrap() is then called explicitly and the script ends */
+define("seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP", true);
+require_once __DIR__ . '/index.php';
+bootstrap();
+exit();
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@ -0,0 +1,944 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Used to set the configuration settings of the Yioop/SeekQuarry project.
+ *
+ * Some settings can be set in the Page Options and Server Settings
+ * and Appearance activities. Other settings can be overriden by making
+ * a LocalConfig.php file in the same folder as this file and using the
+ * same namespace.  If a setting in this file is created using nsdefine
+ * it is unlikely that it is safe to override. If it is created using
+ * nsconddefine it should be fair game for tweaking in the LocalConfig.php
+ * file
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\configs;
+
+/**
+ * Set up class autoloading. If Composer's vendor/autoload.php is present
+ * it is used; otherwise, a fallback autoloader is registered which maps
+ * classes in the seekquarry\yioop\tests namespace to files under
+ * PARENT_DIR/tests and all other seekquarry\yioop classes to files under
+ * WORK_DIRECTORY/app or, failing that, BASE_DIR
+ */
+if (file_exists(__DIR__."/../../vendor/autoload.php")) {
+    require_once __DIR__."/../../vendor/autoload.php";
+} else {
+    spl_autoload_register(function ($class) {
+        $tests_namespace = 'seekquarry\\yioop\\tests';
+        $yioop_namespace = 'seekquarry\\yioop';
+        // the more specific tests namespace has to be checked first
+        if (strncmp($tests_namespace, $class,
+            strlen($tests_namespace)) === 0) {
+            $matched_namespace = $tests_namespace;
+            $search_dirs = [PARENT_DIR . "/tests"];
+        } else if (strncmp($yioop_namespace, $class,
+            strlen($yioop_namespace)) === 0) {
+            $matched_namespace = $yioop_namespace;
+            // local app versions of a class take precedence over src ones
+            $search_dirs = [WORK_DIRECTORY . "/app", BASE_DIR];
+        } else {
+            // not one of our classes, leave it to the next autoloader
+            return;
+        }
+        // part of the class name after the namespace prefix, as a file path
+        $class_path = "/" . str_replace('\\', '/',
+            substr($class, strlen($matched_namespace))) . '.php';
+        foreach ($search_dirs as $search_dir) {
+            $candidate = $search_dir . $class_path;
+            if (file_exists($candidate)) {
+                require $candidate;
+                return;
+            }
+        }
+    });
+}
+/**
+ * Creates a constant inside the Yioop configs namespace
+ * (seekquarry\yioop\configs)
+ * @param string $constant unqualified name of the constant to create
+ * @param mixed $value the value the constant should have
+ */
+function nsdefine($constant, $value)
+{
+    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
+    define($qualified_name, $value);
+}
+/**
+ * Tests whether a constant of the given name exists in the Yioop configs
+ * namespace (seekquarry\yioop\configs)
+ * @param string $constant unqualified name of the constant to look for
+ * @return bool true if the constant has been defined, false otherwise
+ */
+function nsdefined($constant)
+{
+    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
+    return defined($qualified_name);
+}
+/**
+ * Creates a constant inside the Yioop configs namespace
+ * (seekquarry\yioop\configs) unless a constant of that name already exists
+ * there, in which case the existing value is left alone.
+ * @param string $constant unqualified name of the constant to create
+ * @param mixed $value the value to use if the constant gets created
+ */
+function nsconddefine($constant, $value)
+{
+    $qualified_name = "seekquarry\\yioop\\configs\\" . $constant;
+    if (defined($qualified_name)) {
+        return;
+    }
+    define($qualified_name, $value);
+}
+/**
+ * Version number for upgrade function
+ * @var int
+ */
+nsdefine('YIOOP_VERSION', 49);
+/**
+ * Minimum version of Yioop for which keyword ad script
+ * still works with this version
+ * @var int
+ */
+nsdefine('MIN_AD_VERSION', 36);
+/**
+ * nsdefine's the BASE_URL constant for this script.
+ *
+ * The url is assembled from $_SERVER data as scheme, server name, optional
+ * port, and the directory of the running script, and always ends in "/".
+ * The port is taken from the X-Forwarded-Port request header when that
+ * header holds a plain number, otherwise from SERVER_PORT, otherwise it is
+ * assumed to be 80. The port is only written into the url when it is not
+ * the default one for the scheme. If SERVER_NAME is unavailable, localhost
+ * is used.
+ */
+function computeBaseUrl()
+{
+    $pathinfo = pathinfo($_SERVER['SCRIPT_NAME']);
+    $server_port = isset($_SERVER['SERVER_PORT']) ?
+        $_SERVER['SERVER_PORT'] : 80;
+    /* the forwarded port comes from a request header and ends up in urls
+       we output, so only accept it if it is purely numeric */
+    if (isset($_SERVER['HTTP_X_FORWARDED_PORT']) &&
+        preg_match('/^\d{1,5}$/', $_SERVER['HTTP_X_FORWARDED_PORT'])) {
+        $server_port = $_SERVER['HTTP_X_FORWARDED_PORT'];
+    }
+    /* under ISAPI with IIS, HTTPS is set to "off" (a non-empty value) for
+       requests that were not made through https */
+    $https_on = !empty($_SERVER['HTTPS']) &&
+        strtolower($_SERVER['HTTPS']) != 'off';
+    $http = ($https_on || $server_port == 443) ? "https://" : "http://";
+    $port = (($http == "http://" && $server_port != 80) ||
+        ($http == "https://" && $server_port != 443)) ?
+        ":" . $server_port : "";
+    $server_name = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] :
+        "localhost";
+    // on Windows, pathinfo can return backslashes as directory separators
+    $dir_name = str_replace("\\", "/", $pathinfo["dirname"]);
+    if ($dir_name == ".") {
+        $dir_name = "";
+    }
+    $extra_slash = ($dir_name == '/') ? "" : '/';
+    //used in register controller to create links back to server
+    nsdefine("BASE_URL", $http . $server_name . $port . $dir_name .
+        $extra_slash);
+}
+/*
+    pcre is an external library to php which can cause Yioop
+    to seg fault if given instances of reg expressions with
+    large recursion depth on a string.
+    https://bugs.php.net/bug.php?id=47376
+    The goal here is to cut off these problems before they happen.
+    We do this in Config.php because it is included in most Yioop
+    files.
+ */
+ini_set('pcre.recursion_limit', 3000);
+ini_set('pcre.backtrack_limit', 1000000);
+/** Calculate base directory of script: the parent folder of the folder
+ * containing this file, with any backslashes converted to forward slashes.
+ * Only defined if not already defined.
+ * @ignore
+ */
+nsconddefine("BASE_DIR", str_replace("\\", "/", realpath(__DIR__ ."/../")));
+/** BASE_DIR with as many trailing characters removed as the length of
+ * "/src". Only defined if not already defined.
+ * @ignore
+ */
+nsconddefine("PARENT_DIR",  substr(BASE_DIR, 0, -strlen("/src")));
+// sets BASE_URL from the current request's $_SERVER data
+computeBaseUrl();
+/** Yioop Namespace*/
+nsdefine('NS', "seekquarry\\yioop\\");
+/** configs sub-namespace */
+nsdefine('NS_CONFIGS', NS . "configs\\");
+/** controllers sub-namespace */
+nsdefine('NS_CONTROLLERS', NS . "controllers\\");
+/** components sub-namespace (within the controllers sub-namespace) */
+nsdefine('NS_COMPONENTS', NS_CONTROLLERS . "components\\");
+/** executables sub-namespace */
+nsdefine('NS_EXEC', NS . "executables\\");
+/** library sub-namespace */
+nsdefine('NS_LIB', NS . "library\\");
+/** media_jobs sub-namespace (within the library sub-namespace) */
+nsdefine('NS_JOBS', NS_LIB . "media_jobs\\");
+/** Models sub-namespace */
+nsdefine('NS_MODELS', NS . "models\\");
+/** datasources sub-namespace (within the models sub-namespace) */
+nsdefine('NS_DATASOURCES', NS_MODELS . "datasources\\");
+/** archive_bundle_iterators sub-namespace */
+nsdefine('NS_ARCHIVE', NS_LIB . "archive_bundle_iterators\\");
+/** indexing_plugins sub-namespace */
+nsdefine('NS_PLUGINS', NS_LIB . "indexing_plugins\\");
+/** processors sub-namespace */
+nsdefine('NS_PROCESSORS', NS_LIB . "processors\\");
+/** text summarizers sub-namespace */
+nsdefine('NS_SUMMARIZERS', NS_LIB . "summarizers\\");
+/** locale sub-namespace */
+nsdefine('NS_LOCALE', NS . "locale\\");
+/** views sub-namespace */
+nsdefine('NS_VIEWS', NS . "views\\");
+/** elements sub-namespace (within the views sub-namespace) */
+nsdefine('NS_ELEMENTS', NS_VIEWS . "elements\\");
+/** helpers sub-namespace (within the views sub-namespace) */
+nsdefine('NS_HELPERS', NS_VIEWS . "helpers\\");
+/** layouts sub-namespace (within the views sub-namespace) */
+nsdefine('NS_LAYOUTS', NS_VIEWS . "layouts\\");
+/** tests sub-namespace */
+nsdefine('NS_TESTS', NS . "tests\\");
+/** Don't display any query info*/
+nsdefine('NO_DEBUG_INFO', 0);
+/** bit of DEBUG_LEVEL used to indicate test cases should be displayable*/
+nsdefine('TEST_INFO', 1);
+/** bit of DEBUG_LEVEL used to indicate query statistics should be displayed*/
+nsdefine('QUERY_INFO', 2);
+/** bit of DEBUG_LEVEL used to indicate php messages should be displayed*/
+nsdefine('ERROR_INFO', 4);
+/** Constant used to indicate lasting an arbitrary number of seconds */
+nsdefine('FOREVER', -2);
+/** Number of seconds in a day*/
+nsdefine('ONE_DAY', 86400);
+/** Number of seconds in a week*/
+nsdefine('ONE_WEEK', 604800);
+/** Number of seconds in a 30 day month */
+nsdefine('ONE_MONTH', 2592000);
+/** Number of seconds in a 365 day year */
+nsdefine('ONE_YEAR',  31536000);
+/** Number of seconds in an hour */
+nsdefine('ONE_HOUR', 3600);
+/** Number of seconds in a minute */
+nsdefine('ONE_MINUTE', 60);
+/** Number of seconds in a second */
+nsdefine('ONE_SECOND', 1);
+if (file_exists(BASE_DIR."/configs/LocalConfig.php")) {
+    /** Include any locally specified defines (could use as an alternative
+        way to set work directory) */
+    require_once(BASE_DIR."/configs/LocalConfig.php");
+}
+/** Maintenance mode restricts access to local machine. It is defined only
+ *  after LocalConfig.php has been loaded, and only if not already defined,
+ *  so that a LocalConfig.php can switch maintenance mode on. (Defining it
+ *  unconditionally before the include would make any attempt to set it in
+ *  LocalConfig.php fail, as a constant cannot be redefined.)
+ */
+nsconddefine("MAINTENANCE_MODE", false);
+/** setting Profile.php to something else in LocalConfig.php allows one to have
+ *  two different yioop instances share the same work_directory but maybe have
+ *  different configuration settings. This might be useful if one was
+ *  production and one was more dev.
+ */
+nsconddefine('PROFILE_FILE_NAME', "/Profile.php");
+/** Text output to non-local visitors while in maintenance mode */
+nsconddefine('MAINTENANCE_MESSAGE', <<<EOD
+This Yioop! installation is undergoing maintenance, please come back later!
+EOD
+);
+/* in maintenance mode, only requests whose remote address equals the
+   server's own address get past this point */
+if (MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) {
+    echo MAINTENANCE_MESSAGE;
+    exit();
+}
+
+/** Location used for WORK_DIRECTORY when it has not already been defined */
+nsdefine('DEFAULT_WORK_DIRECTORY', PARENT_DIR . "/work_directory");
+
+if (!nsdefined('WORK_DIRECTORY')) {
+/*+++ The next block of code is machine edited, change at
+your own risk, please use configure web page instead +++*/
+nsdefine('WORK_DIRECTORY', DEFAULT_WORK_DIRECTORY);
+/*++++++*/
+// end machine edited code
+}
+/** Directory for local versions of web app classes*/
+nsdefine('APP_DIR', WORK_DIRECTORY."/app");
+/** Directory to place files such as dictionaries that will be
+   converted to Bloom filter using token_tool.php. Similarly,
+   can be used to hold files which will be used to prepare
+   a file to assist in crawling or serving search results
+*/
+nsdefine('PREP_DIR', WORK_DIRECTORY."/prepare");
+/** Locale dir to use in case LOCALE_DIR does not exist yet or is
+ * missing some file
+ */
+nsdefine('FALLBACK_LOCALE_DIR', BASE_DIR."/locale");
+/** Captcha mode indicating to use a text captcha*/
+nsdefine('TEXT_CAPTCHA', 1);
+/** Captcha mode indicating to use a hash cash computation for a captcha*/
+nsdefine('HASH_CAPTCHA', 2);
+/** Captcha mode indicating to use a classic image based captcha*/
+nsdefine('IMAGE_CAPTCHA', 3);
+/** Authentication Mode Possibility*/
+nsdefine('NORMAL_AUTHENTICATION', 1);
+/** Authentication Mode Possibility*/
+nsdefine('ZKP_AUTHENTICATION', 2);
+/** Recovery Mode Possibility (presumably: no account recovery offered) */
+nsdefine('NO_RECOVERY', 0);
+/** Recovery Mode Possibility (presumably: recovery by email) */
+nsdefine('EMAIL_RECOVERY', 1);
+/** Recovery Mode Possibility (presumably: recovery by email together with
+ * recovery questions)
+ */
+nsdefine('EMAIL_AND_QUESTIONS_RECOVERY', 2);
+/** If ZKP Authentication via Fiat Shamir Protocol used how many iterations
+ * to do
+ */
+nsconddefine('FIAT_SHAMIR_ITERATIONS', 20);
+if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
+    if ((file_exists(WORK_DIRECTORY . "/locale/en-US") &&
+        !file_exists(WORK_DIRECTORY . "/locale/en_US"))
+        || (file_exists(WORK_DIRECTORY . "/app/locale/en-US") &&
+        !file_exists(WORK_DIRECTORY . "/app/locale/en_US"))) {
+        $old_profile = file_get_contents(WORK_DIRECTORY . PROFILE_FILE_NAME);
+        $new_profile = preg_replace('/\<\?php/', "<?php\n".
+            "namespace seekquarry\\yioop\\configs;\n",
+            $old_profile);
+        $new_profile = preg_replace("/(define(?:d?))\(/", 'ns$1(',
+            $new_profile);
+        file_put_contents(WORK_DIRECTORY . PROFILE_FILE_NAME, $new_profile);
+    }
+    require_once WORK_DIRECTORY . PROFILE_FILE_NAME;
+    nsdefine('PROFILE', true);
+    nsdefine('CRAWL_DIR', WORK_DIRECTORY);
+    if (is_dir(APP_DIR."/locale")) {
+        nsdefine('LOCALE_DIR', WORK_DIRECTORY."/app/locale");
+    } else if (is_dir(WORK_DIRECTORY."/locale")) {
+        //old work directory location
+        nsdefine('LOCALE_DIR', WORK_DIRECTORY."/locale");
+    } else {
+        /** @ignore */
+        nsdefine('LOCALE_DIR', FALLBACK_LOCALE_DIR);
+    }
+    nsdefine('LOG_DIR', WORK_DIRECTORY."/log");
+    if (nsdefined('DB_URL') && !nsdefined('DB_HOST')) {
+        nsdefine('DB_HOST', DB_URL); //for backward compatibility
+    }
+    if (nsdefined('QUEUE_SERVER') && !nsdefined('NAME_SERVER')) {
+        nsdefine('NAME_SERVER', QUEUE_SERVER); //for backward compatibility
+    }
+    if (NAME_SERVER == 'http://' || NAME_SERVER == 'https://') {
+        nsdefine("FIX_NAME_SERVER", true);
+    }
+} else {
+    if ((!isset( $_SERVER['SERVER_NAME']) ||
+        $_SERVER['SERVER_NAME']!=='localhost')
+        && !nsdefined("NO_LOCAL_CHECK") && !nsdefined("WORK_DIRECTORY")
+        && php_sapi_name() != 'cli' ) {
+        echo "SERVICE AVAILABLE ONLY VIA LOCALHOST UNTIL CONFIGURED";
+        exit();
+    }
+    /** @ignore */
+    nsdefine('PROFILE', false);
+    nsdefine('DBMS', 'Sqlite3');
+    nsdefine('AUTHENTICATION_MODE', NORMAL_AUTHENTICATION);
+    nsdefine('RECOVERY_MODE', EMAIL_RECOVERY);
+    nsdefine('DEBUG_LEVEL', NO_DEBUG_INFO);
+    nsdefine('USE_FILECACHE', false);
+    nsdefine('WEB_ACCESS', true);
+    nsdefine('RSS_ACCESS', true);
+    nsdefine('API_ACCESS', true);
+    nsdefine('REGISTRATION_TYPE', 'disable_registration');
+    nsdefine('USE_MAIL_PHP', true);
+    nsdefine('MAIL_SERVER', '');
+    nsdefine('MAIL_PORT', '');
+    nsdefine('MAIL_USERNAME', '');
+    nsdefine('MAIL_PASSWORD', '');
+    nsdefine('MAIL_SECURITY', '');
+    nsdefine('MEDIA_MODE', 'name_server');
+    nsdefine('DB_NAME', "default");
+    nsdefine('DB_USER', '');
+    nsdefine('DB_PASSWORD', '');
+    nsdefine('DB_HOST', '');
+    /** @ignore */
+    nsdefine('CRAWL_DIR', BASE_DIR);
+    /** @ignore */
+    nsdefine('LOCALE_DIR', FALLBACK_LOCALE_DIR);
+    /** @ignore */
+    nsdefine('LOG_DIR', BASE_DIR."/log");
+    nsdefine('NAME_SERVER', "http://localhost/");
+    nsdefine('USER_AGENT_SHORT', "NeedsNameBot");
+    nsdefine('DEFAULT_LOCALE', "en-US");
+    nsdefine('AUTH_KEY', 0);
+    nsdefine('USE_MEMCACHE', false);
+    nsdefine('USE_PROXY', false);
+    nsdefine('TOR_PROXY', '127.0.0.1:9150');
+    nsdefine('PROXY_SERVERS', null);
+    nsdefine('WORD_SUGGEST', true);
+    nsdefine('CACHE_LINK', true);
+    nsdefine('SIMILAR_LINK', true);
+    nsdefine('IN_LINK', true);
+    nsdefine('IP_LINK', true);
+    nsdefine('RESULT_SCORE', true);
+    nsdefine('SIGNIN_LINK', true);
+    nsdefine('NEWS_MODE', 'news_off');
+    /** BM25F weight for title text */
+    nsdefine ('TITLE_WEIGHT', 4);
+    /** BM25F weight for other text within doc*/
+    nsdefine ('DESCRIPTION_WEIGHT', 1);
+    /** BM25F weight for other text within links to a doc*/
+    nsdefine ('LINK_WEIGHT', 2);
+    /** If that many exist, the minimum number of results to get
+        and group before trying to compute the top x (say 10) results
+     */
+    nsdefine ('MIN_RESULTS_TO_GROUP', 200);
+    /** For a given number of search results total to return (total_num)
+        server_alpha*total_num/num_servers will be returned by any given
+        queue server machine*/
+    nsdefine ('SERVER_ALPHA', 1.6);
+    nsdefine('BACKGROUND_COLOR', "#FFFFFF");
+    nsdefine('FOREGROUND_COLOR', "#FFFFFF");
+    nsdefine('SIDEBAR_COLOR', "#88AA44");
+    nsdefine('TOPBAR_COLOR', "#EEEEFF");
+    nsdefine('AD_LOCATION','none');
+}
+/** URL that all url paths will be constructed from */
+nsconddefine('BASE_URL', NAME_SERVER);
+/** Relative url to website logo */
+nsconddefine('LOGO', "resources/yioop.png");
+/** Relative url to mobile website logo */
+nsconddefine('M_LOGO', "resources/m-yioop.png");
+/** Url for website favicon */
+nsconddefine('FAVICON', BASE_URL . "favicon.ico");
+/** Timezone for website */
+nsconddefine('TIMEZONE', 'America/Los_Angeles');
+/* Name of the cookie used to manage the session
+   (store language and perpage settings), followed by the name used
+   for the CSRF token
+ */
+nsconddefine('SESSION_NAME', "yioopbiscuit");
+nsconddefine('CSRF_TOKEN', "YIOOP_TOKEN");
+/** locations that ads can be placed in search result pages */
+nsconddefine('AD_LOCATION', "none");
+date_default_timezone_set(TIMEZONE);
+/* Report every PHP error when the ERROR_INFO bit of DEBUG_LEVEL is set,
+   otherwise turn error reporting off completely */
+if ((DEBUG_LEVEL & ERROR_INFO) == ERROR_INFO) {
+    error_reporting(-1);
+} else {
+    error_reporting(0);
+}
+/** if true, tests are displayable (TEST_INFO bit of DEBUG_LEVEL is set) */
+nsdefine('DISPLAY_TESTS', ((DEBUG_LEVEL & TEST_INFO) == TEST_INFO));
+/** if true, query statistics are displayed (QUERY_INFO bit of DEBUG_LEVEL
+    is set) */
+nsconddefine('QUERY_STATISTICS', ((DEBUG_LEVEL & QUERY_INFO) == QUERY_INFO));
+/* Decide whether mobile css and formatting should be used: the user agent
+   must mention "mobile" or "fennec" and must not mention "ipad". Requests
+   that carry no user agent are treated as non-mobile. */
+if (isset($_SERVER['HTTP_USER_AGENT'])) {
+    $agent = $_SERVER['HTTP_USER_AGENT'];
+    nsdefine("MOBILE", !stristr($agent, "ipad") &&
+        (stristr($agent, "mobile") || stristr($agent, "fennec")));
+} else {
+    nsdefine("MOBILE", false);
+}
+/*
+ * Various groups and user ids. These must be nsdefined before the
+ * profile check and return below
+ */
+/** ID of the root user */
+nsdefine('ROOT_ID', 1);
+/**User name of the root user. If you want to change this, change
+ the value in LocalConfig.php, then run the Createdb.php script. You
+ should do this before you have much data in your system. */
+nsconddefine('ROOT_USERNAME', "root");
+/** Role of the root user */
+nsdefine('ADMIN_ROLE', 1);
+/** Default role of an active user */
+nsdefine('USER_ROLE', 2);
+/** Default role of an advertiser */
+nsdefine('BUSINESS_ROLE', 3);
+/** Default role of a bot user */
+nsdefine('BOT_ROLE', 4);
+/** ID of the group to which all Yioop users belong */
+nsdefine('PUBLIC_GROUP_ID', 2);
+/** ID of the public user. NOTE(review): the previous text here was a copy
+ of the PUBLIC_GROUP_ID description; presumably this is the account used
+ for visitors who are not signed in -- confirm */
+nsdefine('PUBLIC_USER_ID', 2);
+/** ID of the group to which all Yioop Help Wiki articles belong */
+nsdefine('HELP_GROUP_ID', 3);
+/** Length of advertisement name string */
+nsdefine('ADVERTISEMENT_NAME_LEN', 25);
+/** Length of advertisement text description */
+nsdefine('ADVERTISEMENT_TEXT_LEN', 35);
+/** Length of advertisement keywords */
+nsdefine('ADVERTISEMENT_KEYWORD_LEN', 60);
+/** Length of advertisement date */
+nsdefine('ADVERTISEMENT_DATE_LEN', 20);
+/** Length of advertisement destination */
+nsdefine('ADVERTISEMENT_DESTINATION_LEN', 60);
+/** status value given to an advertisement when it is created (active) */
+nsdefine('ADVERTISEMENT_ACTIVE_STATUS', 1);
+/** value used to stop advertisement campaign */
+nsdefine('ADVERTISEMENT_DEACTIVATED_STATUS',2);
+/** value used when an admin suspends an advertisement campaign */
+nsdefine('ADVERTISEMENT_SUSPENDED_STATUS',3);
+/** value used to indicate campaign completed successfully */
+nsdefine('ADVERTISEMENT_COMPLETED_STATUS',4);
+/* Without a profile none of the constants below this point get defined */
+if (!PROFILE) {
+    return;
+}
+/*+++ End machine generated code, feel free to edit the below as desired +++*/
+/** This is the User-Agent string the crawler sends to a web server it is
+ * crawling
+ */
+nsconddefine('USER_AGENT',
+    'Mozilla/5.0 (compatible; '.USER_AGENT_SHORT.'; +'.NAME_SERVER.'bot.php)');
+/**
+ * To change the Open Search Tool bar name override the following variable
+ * in your local_config.php file
+ */
+nsconddefine('SEARCHBAR_PATH', NAME_SERVER . "yioopbar.xml");
+/**
+ * Phantom JS is used by some optional Javascript tests of the Yioop interface.
+ * The constant PHANTOM_JS should point to the path to phantomjs
+ */
+nsconddefine("PHANTOM_JS", "phantomjs");
+/** maximum size of a log file before it is rotated */
+nsconddefine("MAX_LOG_FILE_SIZE", 5000000);
+/** number of log files to rotate amongst */
+nsconddefine("NUMBER_OF_LOG_FILES", 5);
+/**
+ * how long in seconds to keep a cache of a robots.txt
+ * file before re-requesting it
+ */
+nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY);
+/**
+ * Whether the scheduler should track ETag and Expires headers.
+ * If you want to turn this off set the variable to false in
+ * local_config.php
+ */
+nsconddefine('USE_ETAG_EXPIRES', true);
+/**
+ * if the robots.txt has a Crawl-delay larger than this
+ * value don't crawl the site.
+ * maximum value for this is 255
+ */
+nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
+/** maximum number of active crawl-delayed hosts */
+nsconddefine('MAX_WAITING_HOSTS', 250);
+/** Minimum weight in priority queue before rebuilt */
+nsconddefine('MIN_QUEUE_WEIGHT', 1/100000);
+/**  largest sized object allowed in a web archive (used to sanity check
+ *  reading data out of a web archive)
+ */
+nsconddefine('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
+/** Treat earlier timestamps as being indexes of format version 0 */
+nsconddefine('VERSION_0_TIMESTAMP', 1369754208);
+/* Defines MEMORY_LOW, MEMORY_STANDARD, MEMORY_PROFILE and
+   FETCHER_PROCESS_DELAY; MEMORY_PROFILE scales several of the size
+   constants defined further down */
+defineMemoryProfile();
+/**
+ * Estimates how much RAM the current machine has and, based on that,
+ * defines the constants MEMORY_LOW, MEMORY_STANDARD, MEMORY_PROFILE and
+ * FETCHER_PROCESS_DELAY.
+ *
+ * On Windows the total is read from `wmic memorychip get capacity`; on
+ * Linux from the MemTotal line of /proc/meminfo. On any other OS, or
+ * whenever detection produces no usable number (exec disabled, wmic
+ * missing, /proc/meminfo unreadable), 4GB is assumed rather than letting
+ * the total collapse to 0.
+ */
+function defineMemoryProfile()
+{
+    //assume have at least 4GB on a Mac(could use vm_stat)
+    $memory = 4000000000;
+    if (strstr(PHP_OS, "WIN")) {
+        if (function_exists("exec")) {
+            $memory_array = [];
+            exec('wmic memorychip get capacity', $memory_array);
+            /* Output is a header line followed by one byte count per
+               memory chip. Sum only the numeric lines so the header, or
+               an error message when wmic is unavailable, cannot turn the
+               total into 0. */
+            $detected = 0;
+            foreach ($memory_array as $line) {
+                $line = trim($line);
+                if (is_numeric($line)) {
+                    $detected += $line;
+                }
+            }
+            if ($detected > 0) {
+                $memory = $detected;
+            }
+        }
+    } else if (stristr(PHP_OS, "LINUX")) {
+        $meminfo = is_readable("/proc/meminfo") ?
+            file_get_contents("/proc/meminfo") : false;
+        /* MemTotal is reported in kB; look for the line by name instead
+           of assuming it is the first token pair in the file */
+        if ($meminfo !== false &&
+            preg_match('/^MemTotal:\s+(\d+)/m', $meminfo, $matches)) {
+            $memory = 1024 * intval($matches[1]);
+        }
+    }
+    /**
+     * Factor to multiply sizes of Yioop data structures with in low ram memory
+     * setting (2GB)
+     */
+    nsdefine('MEMORY_LOW', 1);
+    /**
+     * Factor to multiply sizes of Yioop data structures with if have more than
+     * (2GB)
+     */
+    nsdefine('MEMORY_STANDARD', 4);
+    if ($memory < 2200000000) {
+        /**
+         * Based on system memory, either the low or high memory factor
+         */
+        nsdefine('MEMORY_PROFILE', MEMORY_LOW);
+    } else {
+        /**
+         * @ignore
+         */
+        nsdefine('MEMORY_PROFILE', MEMORY_STANDARD);
+    }
+    /**
+     * Delay in microseconds between processing pages to try to avoid
+     * CPU overheating. On some systems, you can set this to 0.
+     */
+    nsconddefine('FETCHER_PROCESS_DELAY', 10000);
+}
+
+/**
+ * bloom filters are used to keep track of which urls are visited,
+ * this parameter determines up to how many
+ * urls will be stored in a single filter. Additional filters are
+ * read to and from disk.
+ */
+nsconddefine('URL_FILTER_SIZE', MEMORY_PROFILE * 5000000);
+/**
+ * maximum number of urls that will be held in ram
+ * (as opposed to in files) in the priority queue
+ */
+nsconddefine('NUM_URLS_QUEUE_RAM', MEMORY_PROFILE * 80000);
+/** number of documents before next gen */
+nsconddefine('NUM_DOCS_PER_GENERATION', MEMORY_PROFILE * 10000);
+/** precision used when rounding floating point document scores */
+nsconddefine('PRECISION', 10);
+/** maximum number of links to extract from a page on an initial pass*/
+nsconddefine('MAX_LINKS_TO_EXTRACT', MEMORY_PROFILE * 80);
+/** maximum number of links to keep after initial extraction*/
+nsconddefine('MAX_LINKS_PER_PAGE', 50);
+/** Estimate of the average number of links per page a document has*/
+nsconddefine('AVG_LINKS_PER_PAGE', 24);
+/** maximum number of links to consider from a sitemap page */
+nsconddefine('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80);
+/**  maximum number of words from links to consider on any given page */
+nsconddefine('MAX_LINKS_WORD_TEXT', 100);
+/**  maximum length of urls to try to queue, this is important for
+ *  memory when creating schedule, since the amount of memory is
+ *  going to be greater than the product MAX_URL_LEN*MAX_FETCH_SIZE
+ *  text_processors need to promise to implement this check or rely
+ *  on the base class which does implement it in extractHttpHttpsUrls
+ */
+nsconddefine('MAX_URL_LEN', 512);
+/** request this many bytes out of a page -- this is the default value to
+ * use if the user doesn't set this value in the page options GUI
+ */
+nsdefine('PAGE_RANGE_REQUEST', 50000);
+/**
+ * When getting information from an index dictionary in word iterator
+ * how many distinct generations to read in one go
+ */
+nsconddefine('NUM_DISTINCT_GENERATIONS', 20);
+/**
+ * Max number of chars to extract for description from a page to index.
+ * Only words in the description are indexed. -- this is the default value
+ * can be set in Page Options
+ */
+nsdefine('MAX_DESCRIPTION_LEN', 2000);
+/**
+ * Allow pages to be recrawled after this many days -- this is the
+ * default value to use if the user doesn't set this value in the page options
+ * GUI. What this controls is how often the page url filter is deleted.
+ * A nonpositive value means the filter will never be deleted.
+ */
+nsdefine('PAGE_RECRAWL_FREQUENCY', -1);
+/** number of multi curl page requests in one go */
+nsconddefine('NUM_MULTI_CURL_PAGES', 100);
+/** number of pages to extract from an archive in one go */
+nsconddefine('ARCHIVE_BATCH_SIZE', 100);
+/** time in seconds before we give up on multi page requests*/
+nsconddefine('PAGE_TIMEOUT', 30);
+/** time in seconds before we give up on a single page request*/
+nsconddefine('SINGLE_PAGE_TIMEOUT', ONE_MINUTE);
+/** max time in seconds in a process before write a log message if
+ *  crawlTimeoutLog is called repeatedly from a loop
+ */
+nsconddefine('LOG_TIMEOUT', 30);
+/** Number of lines of QueueServer log file to check to make sure both
+ *  Indexer and Scheduler are running. 6000 should be roughly 20-30 minutes
+ */
+nsconddefine('LOG_LINES_TO_RESTART', 6000);
+/** File name of file used to record last log lines when a Yioop process has
+ * crashed.
+ */
+nsconddefine('CRASH_LOG_NAME', LOG_DIR . "/YioopCrashes.log");
+/**
+ * Maximum time a crawl daemon process can go before calling
+ * @see CrawlDaemon::processHandler
+ */
+nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE);
+/**
+ * Number of error responses (status 400 or greater) seen from a host
+ * before the host is crawl-delayed and its remaining urls are dumped from
+ * the current schedule
+ */
+nsconddefine('DOWNLOAD_ERROR_THRESHOLD', 50);
+/** Crawl-delay to set in the event that DOWNLOAD_ERROR_THRESHOLD exceeded*/
+nsconddefine('ERROR_CRAWL_DELAY', 20);
+/**
+ * if FFMPEG defined, the maximum size of an uploaded video file which will
+ * be automatically transcoded by Yioop to mp4 and webm
+ */
+nsconddefine("MAX_VIDEO_CONVERT_SIZE", 2000000000);
+/**
+ * The maximum time limit in seconds: if a file has not been converted
+ * within this time it will be picked up again by the client media updater.
+ * This value largely depends on the number of client media updaters that we
+ * have and also the maximum video size that would be uploaded to yioop.
+ * This value should be kept larger than the sleeping time of the media
+ * updater loop to avoid conversion of the same file multiple times.
+ */
+nsconddefine('MAX_FILE_TIMESTAMP_LIMIT', 600);
+/**
+ * This mail timestamp limit allows mail server to create a new file
+ * and write next mailer batch in the new file. Otherwise, new mailer
+ * batch will be written in old file. E.g., a new file will be created every
+ * 5 minutes as per below value.
+ */
+nsconddefine('MAX_MAIL_TIMESTAMP_LIMIT', 300);
+/**
+ * Default edge size of square image thumbnails in pixels
+ */
+nsconddefine('THUMB_DIM', 128);
+/**
+ * Maximum size of a user thumb file that can be uploaded
+ */
+nsconddefine('THUMB_SIZE', 1000000);
+/** Characters we view as not part of words, not same as POSIX [:punct:]*/
+nsconddefine('PUNCT', "\.|\,|\:|\;|\"|\'|\[|\/|\%|\?|-|" .
+    "\]|\{|\}|\(|\)|\!|\||\&|\`|" .
+    "\’|\‘|©|®|™|℠|…|\/|\>|，|\=|。|）|：|、|" .
+    "”|“|《|》|（|「|」|★|【|】|·|\+|\*|；".
+        "|！|—|―|？|！|،|؛|؞|؟|٪|٬|٭");
+/** Number of total description deemed title */
+nsconddefine('AD_HOC_TITLE_LENGTH', 50);
+/** Used to say number of bytes in histogram bar (stats page) for file
+    download sizes
+ */
+nsconddefine('DOWNLOAD_SIZE_INTERVAL', 5000);
+/** Used to say number of secs in histogram bar for file download times*/
+nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
+/**
+ * How many non robot urls the fetcher successfully downloads between
+ * the times it sends data back to the queue server
+ */
+nsconddefine('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 95);
+/** maximum number of urls to schedule to a given fetcher in one go */
+nsconddefine('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000);
+/** fetcher must wait at least this long between multi-curl requests */
+nsconddefine('MINIMUM_FETCH_LOOP_TIME', 5);
+/** an idling fetcher sleeps this long between queue_server pings*/
+nsconddefine('FETCH_SLEEP_TIME', 10);
+/** a queue_server's minimum loop idle time*/
+nsconddefine('QUEUE_SLEEP_TIME', 5);
+/** How often mirror script tries to synchronize with machine it is mirroring*/
+nsconddefine('MIRROR_SYNC_FREQUENCY', ONE_HOUR);
+/** How often mirror script tries to notify machine it is mirroring that it
+is still alive*/
+nsconddefine('MIRROR_NOTIFY_FREQUENCY', ONE_MINUTE);
+/** Max time before dirty index (queue_server) and
+    filters (fetcher) will be force saved in seconds*/
+nsconddefine('FORCE_SAVE_TIME', ONE_HOUR);
+/** Number of seconds of no fetcher contact before crawl is deemed dead */
+nsdefine("CRAWL_TIME_OUT", 1800);
+/** maximum number of terms allowed in a conjunctive search query */
+nsconddefine('MAX_QUERY_TERMS', 10);
+/** maximum length of a search query */
+nsconddefine('MAX_QUERY_LEN', 4096);
+/** whether to use question answering system */
+nsconddefine('ENABLE_QUESTION_ANSWERING', true);
+/** Number of words until to switch from bag of words to phrase lookup */
+nsconddefine('PHRASE_THRESHOLD', 3);
+/** default number of search results to display per page */
+nsconddefine('NUM_RESULTS_PER_PAGE', 10);
+/** Number of recently crawled urls to display on admin screen */
+nsconddefine('NUM_RECENT_URLS_TO_DISPLAY', 10);
+/** Maximum time a set of results can stay in query cache before it is
+    invalidated. If negative, then never use time to kick something out of
+    cache. */
+nsconddefine('MAX_QUERY_CACHE_TIME', 2 * ONE_DAY); //two days
+/** Minimum time a set of results can stay in query cache before it is
+    invalidated (used for active crawl or feed results) */
+nsconddefine('MIN_QUERY_CACHE_TIME', ONE_HOUR); //one hour
+/**
+ * Default number of items to page through for users,roles, mixes, etc
+ * on the admin screens
+ */
+nsconddefine('DEFAULT_ADMIN_PAGING_NUM', 50);
+/** Maximum number of bytes that the file that the suggest-a-url form
+ * sends data to can be.
+ */
+nsconddefine('MAX_SUGGEST_URL_FILE_SIZE', 100000);
+/** Maximum number of urls a user can suggest to the suggest-a-url form in
+ * one day
+ */
+nsconddefine('MAX_SUGGEST_URLS_ONE_DAY', 10);
+/** Directly add suggested urls to crawl options and inject them into any
+ *  active crawl. If false, these are stored in a file and the user has to
+ *  click a button to add them.
+ */
+nsconddefine('DIRECT_ADD_SUGGEST', false);
+/**
+ * Length after which to truncate names for users/groups/roles when
+ * they are displayed (not in DB)
+ */
+nsconddefine('NAME_TRUNCATE_LEN', 7);
+/** USER STATUS value used for someone who is not in a group but can browse*/
+nsdefine('NOT_MEMBER_STATUS', -1);
+/** USER STATUS value used for a user who can log in and perform activities */
+nsdefine('ACTIVE_STATUS', 1);
+/**
+ * USER STATUS value used for a user whose account is created, but which
+ * still needs to undergo admin or email verification/activation
+ */
+nsdefine('INACTIVE_STATUS', 2);
+/**
+ * USER STATUS used to indicate an account which can no longer perform
+ * activities but which might be retained to preserve old blog posts.
+ */
+nsdefine('SUSPENDED_STATUS', 3);
+/** Group status used to indicate a user that has been invited to join
+ * a group but who has not yet accepted
+ */
+nsdefine('INVITED_STATUS', 4);
+/**
+ * Group registration type that only allows people to join a group by
+ * invitation
+ */
+nsdefine('NO_JOIN', 1);
+/**
+ * Group registration type that only allows people to request a membership
+ * in a group from the group's owner
+ */
+nsdefine('REQUEST_JOIN', 2);
+/**
+ * Group registration type that only allows people to request a membership
+ * in a group from the group's owner, but allows people to browse the group's
+ * content without joining
+ */
+nsdefine('PUBLIC_BROWSE_REQUEST_JOIN', 3);
+/**
+ * Group registration type that allows anyone to obtain membership
+ * in the group
+ */
+nsdefine('PUBLIC_JOIN', 4);
+/**
+ *  Group access code signifying only the group owner can
+ *  read items posted to the group or post new items
+ */
+nsdefine('GROUP_PRIVATE', 1);
+/**
+ *  Group access code signifying members of the group can
+ *  read items posted to the group but only the owner can post
+ *   new items
+ */
+nsdefine('GROUP_READ', 2);
+/**
+ *  Group access code for read-and-comment access.
+ *  NOTE(review): the previous text here was a verbatim copy of the
+ *  GROUP_READ description. Judging by the name, members can presumably
+ *  read items and comment on existing threads while only the owner can
+ *  start new ones -- confirm against the code that checks this value
+ */
+nsdefine('GROUP_READ_COMMENT', 3);
+/**
+ *  Group access code signifying members of the group can both
+ *  read items posted to the group as well as post new items
+ */
+nsdefine('GROUP_READ_WRITE', 4);
+/**
+ *  Group access code signifying members of the group can both
+ *  read items posted to the group as well as post new items
+ *  and can edit the group's wiki
+ */
+nsdefine('GROUP_READ_WIKI', 5);
+/**
+ * Indicates a group where people can't up and down vote threads
+ */
+nsdefine("NON_VOTING_GROUP", 0);
+/**
+ * Indicates a group where people can vote up threads (but not down)
+ */
+nsdefine("UP_VOTING_GROUP", 1);
+/**
+ * Indicates a group where people can vote up and down threads
+ */
+nsdefine("UP_DOWN_VOTING_GROUP", 2);
+/**
+ *  Typical posts to a group feed are on user created threads and
+ *  so are of this type
+ */
+nsdefine('STANDARD_GROUP_ITEM', 0);
+/**
+ *  Indicates the thread was created to go alongside the creation of a wiki
+ *  page so that people can discuss the page's contents
+ */
+nsdefine('WIKI_GROUP_ITEM', 1);
+/**
+ *  Marker value for a standard wiki link.
+ *  NOTE(review): the previous text here was a copy of the
+ *  WIKI_TEMPLATE_LINK description; the exact meaning should be confirmed
+ *  against the wiki link handling code
+ */
+nsdefine('WIKI_STANDARD_LINK', -1);
+/**
+ *  Used to record that a page belongs to the template category
+ */
+nsdefine('WIKI_TEMPLATE_LINK', -2);
+/**
+ *  set to true if Multiple news updaters are running
+ *  otherwise set to false if name server is running the news updater.
+ *  NOTE(review): the constant's name suggests it controls whether mail is
+ *  sent via the media updater; confirm that this description still matches
+ */
+nsconddefine('SEND_MAIL_MEDIA_UPDATER', false);
+/**
+ * Impression type used to record one view of a thread
+ */
+nsdefine('THREAD_IMPRESSION', 1);
+/**
+ * Impression type used to record one view of a wiki page
+ */
+nsdefine('WIKI_IMPRESSION', 2);
+/**
+ * Impression type used to record one thread or wiki page view in a group
+ */
+nsdefine('GROUP_IMPRESSION', 3);
+/**
+ * Impression type used to record one search query view
+ */
+nsdefine('QUERY_IMPRESSION', 4);
+/**
+ * Used to control update frequency of impression analytic data when
+ * media updater in use
+ */
+nsconddefine("ANALYTICS_UPDATE_INTERVAL", ONE_HOUR / 6);
+/** Value of epsilon in differential privacy formula */
+nsconddefine('PRIVACY_EPSILON', 0.01);
+/** Flag to turn on/off search impression recording */
+nsconddefine('SEARCH_ANALYTICS', true);
+/** Flag to turn on/off group impression recording */
+nsconddefine('GROUP_ANALYTICS', true);
+/** Flag to turn on/off differential privacy */
+nsconddefine('DIFFERENTIAL_PRIVACY', false);
+/*
+ * Database Field Sizes
+ */
+/** Length for names of things like first name, last name, etc */
+nsdefine('NAME_LEN', 32);
+/** Used for lengths of media sources, passwords, and emails */
+nsdefine('LONG_NAME_LEN', 64);
+/** Length for names of things like group names, etc */
+nsdefine('SHORT_TITLE_LEN', 128);
+/** Length for names of things like titles of blog entries, etc */
+nsdefine('TITLE_LEN', 512);
+/** Length of a feed item or post, etc */
+nsdefine('MAX_GROUP_POST_LEN', 8192);
+/** Length for the contents of a wiki_page */
+nsdefine('MAX_GROUP_PAGE_LEN', 524288);
+/** Length for base 64 encode timestamps */
+nsdefine('TIMESTAMP_LEN', 11);
+/** Length for timestamps down to microseconds */
+nsdefine('MICROSECOND_TIMESTAMP_LEN', 20);
+/** Length for a CAPTCHA */
+nsdefine('CAPTCHA_LEN', 6);
+/** Length for an IP address stored as a string (39 characters is the
+    longest textual form of an IPv6 address) */
+nsdefine('MAX_IP_ADDRESS_AS_STRING_LEN', 39);
+/** Length for a number field */
+nsdefine('NUM_FIELD_LEN', 4);
+/** Length for writing mode in locales */
+nsdefine('WRITING_MODE_LEN', 5);
+/** Length of zero knowledge password string */
+nsdefine('ZKP_PASSWORD_LEN', 200);
+/*
+ * Adjustable AD RELATED defines
+ */
+/** Truncate length for ad description and keywords*/
+nsdefine('ADVERTISEMENT_TRUNCATE_LEN', 8);
+/** Initial bid amount for advertisement keyword */
+nsconddefine('AD_KEYWORD_INIT_BID',1);
+/** Allows the root account to purchase free ad credits. Might
+ *  mess up the value of credits if allowed. This only makes a difference
+ *  in the presence of an ad processing script
+ */
+nsconddefine('ALLOW_FREE_ROOT_CREDIT_PURCHASE', false);
+/** advertisement date format for start date and end date*/
+nsconddefine('AD_DATE_FORMAT','Y-m-d');
+/** advertisement logo*/
+nsconddefine('AD_LOGO','resources/adv-logo.png');
+/** sentence compression enabled or not*/
+nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
--- a/src/configs/ConfigureTool.php
+++ b/src/configs/ConfigureTool.php
@ -0,0 +1,583 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Used to create and manipulate a profile and work directory from the
+ * command-line for Yioop.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\configs;
+
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\controllers\AdminController;
+
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/** Loads common utility functions*/
+require_once  __DIR__."/../library/Utility.php";
+/** Loads locale-related functions*/
+require_once __DIR__."/../library/LocaleFunctions.php";
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+/**
+ * Convenience wrapper that writes its argument to the current output
+ *
+ * @param string $text what to write
+ */
+function e($text)
+{
+    print $text;
+}
+/* Guess a locale tag and hand it to setLocaleObject(). $locale is reset to
+   null beforehand -- NOTE(review): presumably setLocaleObject() fills in
+   this global; confirm in the library code */
+$locale_tag = L\guessLocale();
+$locale = null;
+L\setLocaleObject($locale_tag);
+/**
+ * Provides a command-line interface way to configure a Yioop Instance.
+ * Unlike the web interface this interface is English-only.
+ */
+class ConfigureTool
+{
+    /**
+     * Used to hold an AdminController object used to manipulate the
+     * Yioop configuration
+     * @var object
+     */
+    public $admin;
+    /**
+     * Holds the main menu data for the configuration tool
+     * @var array
+     */
+    public $menu = ["workDirectory" => "Create/Set Work Directory",
+        "rootPassword" => "Change root password",
+        "defaultLocale"=> "Set Default Locale",
+        "debugDisplay"=> "Debug Display Set-up",
+        "searchAccess"=> "Search Access Set-up",
+        "searchPageElementLinks" => "Search Page Elements and Links",
+        "nameServer" => "Name Server Set-up",
+        "robotSetUp"=> "Crawl Robot Set-up",
+        "quit" => "Exit program"
+    ];
+    /**
+     * Yioop's configuration is changed by calling AdminController
+     * methods, and those methods read their input from the super globals
+     * an HTTP request would normally populate. Since this tool runs from
+     * the command line it fakes those values itself, so the constructor
+     * starts every relevant super global off as an empty array and then
+     * creates the AdminController that will be driven.
+     */
+    function __construct()
+    {
+        $_REQUEST = $_POST = $_GET = $_SERVER = $_SESSION = [];
+        $this->admin = new AdminController();
+    }
+    /**
+     * This is the main loop where options of what the user can configure
+     * are presented, a choice is requested, and so on...
+     *
+     * Each state is the name of a method of this class (a key of
+     * $this->menu or "configureMenu"); calling that method returns the
+     * next state. The loop ends when the state becomes "quit". If a
+     * method hands back a state that is not a known activity, control
+     * returns to the main menu instead of spinning forever on a state
+     * that can never change.
+     */
+    function loop()
+    {
+        $activities = array_keys($this->menu);
+        $activities[] = "configureMenu";
+        $state = "configureMenu";
+        while ($state !== "quit") {
+            if (!in_array($state, $activities, true)) {
+                // unknown state: recover by showing the main menu again
+                $state = "configureMenu";
+            }
+            $state = $this->$state();
+        }
+    }
+    /**
+     * Draws the main configuration menu: prints the banner and the result
+     * of the system check, then asks the user to pick an item. Until a
+     * profile exists only the work directory and exit items are offered.
+     *
+     * @return mixed whatever drawChooseItems returns for the user's pick
+     */
+    function configureMenu()
+    {
+        $this->banner();
+        $config = $this->callConfigure();
+        $rule = "\n===============================\n";
+        e("Checking Yioop configuration..." . $rule);
+        e(str_replace("<br />", "\n", $config["SYSTEM_CHECK"]) . $rule);
+        if ($config["PROFILE"]) {
+            $choices = $this->menu;
+        } else {
+            $choices = [
+                "workDirectory" => "Create/Set Work Directory",
+                "quit" => "Exit program"
+            ];
+        }
+        return $this->drawChooseItems($choices, "configureMenu");
+    }
+    /**
+     * Shows the current work directory of this Yioop instance, reads a
+     * replacement from the user and submits it as a "directory" configure
+     * request.
+     *
+     * @return mixed the value returned by confirmChange
+     */
+    function workDirectory()
+    {
+        $this->banner();
+        $config = $this->callConfigure();
+        $current = "No value set yet.";
+        if (isset($config["WORK_DIRECTORY"]) &&
+            $config["WORK_DIRECTORY"] != "") {
+            $current = $config["WORK_DIRECTORY"];
+        }
+        e("CURRENT WORK DIRECTORY: " . $current . "\n\n");
+        e("Enter a new value:\n");
+        if (!isset($_SERVER['REQUEST_URI'])) {
+            $_SERVER['REQUEST_URI'] = "";
+        }
+        $this->prepareGlobals($config);
+        $_REQUEST["WORK_DIRECTORY"] = L\readInput();
+        $_REQUEST["arg"] = "directory";
+        return $this->confirmChange("configure", "workDirectory");
+    }
+    /**
+     * Prompts for the old password and (twice) the new password of the
+     * root account and submits them as an "updateuser" request on behalf
+     * of ROOT_ID. Returns to the main menu with a message if no profile
+     * exists yet.
+     *
+     * @return mixed "configureMenu" when there is no profile, otherwise
+     *      the value returned by confirmChange
+     */
+    function rootPassword()
+    {
+        $this->banner();
+        $config = $this->callConfigure();
+        if ($config["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        $prompts = [
+            "password" => "Enter old password:",
+            "new_password" => "Enter new password:",
+            "retype_password" => "Re-Enter new password:"
+        ];
+        foreach ($prompts as $field => $prompt) {
+            e($prompt);
+            $_REQUEST[$field] = L\readPassword();
+        }
+        $_SESSION['USER_ID'] = ROOT_ID;
+        $_REQUEST['arg'] = "updateuser";
+        $_REQUEST['edit_pass'] = "true";
+        return $this->confirmChange("manageAccount", "rootPassword");
+    }
+    /**
+     * Changes the default locale (language) used by Yioop when it cannot
+     * determine that information from the user's browser. The current
+     * language is shown, the user picks one of the available languages
+     * (or the main menu), and the pick is stored in
+     * $_REQUEST["DEFAULT_LOCALE"].
+     *
+     * @return string "configureMenu" if there is no profile or the user
+     *      asked for the main menu, otherwise "defaultLocale"
+     */
+    function defaultLocale()
+    {
+        $this->banner();
+        $config = $this->callConfigure();
+        if ($config["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        $current_language = $config["LANGUAGES"][$config["DEFAULT_LOCALE"]];
+        e("CURRENT LANGUAGE: " . $current_language . "\n\n");
+        $_SESSION = [];
+        $choices = $config["LANGUAGES"];
+        $choices["configureMenu"] = "Return to Main Menu";
+        // keep asking for as long as the menu hands back this same state
+        $picked = "defaultLocale";
+        while ($picked == "defaultLocale") {
+            $picked = $this->drawChooseItems($choices, "defaultLocale");
+        }
+        $this->prepareGlobals($config);
+        if ($picked != "configureMenu") {
+            $_REQUEST["DEFAULT_LOCALE"] = $picked;
+            return "defaultLocale";
+        }
+        $_REQUEST = [];
+        $_SERVER = [];
+        return "configureMenu";
+    }
+    /**
+     * Used to configure debugging information for this Yioop instance.
+     * i.e., whether PHP notices, warnings, errors, should be displayed,
+     * whether query statistics and info should be displayed, and whether
+     * unit tests should be viewable from the web.
+     *
+     * The current settings are printed, the user picks one flag to toggle
+     * (or the main menu), and every flag that is on after the toggle is
+     * set to true in $_REQUEST.
+     *
+     * @return string "configureMenu" if there is no profile or the user
+     *      asked for the main menu, otherwise "debugDisplay"
+     */
+    function debugDisplay()
+    {
+        $this->banner();
+        $data = $this->callConfigure();
+        if ($data["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        e("CURRENT DEBUG SETTINGS\n======================\n");
+        $dlevel = $data["DEBUG_LEVEL"];
+        $setting = ($dlevel & ERROR_INFO) ? "On" : "Off";
+        e("Error Info: [$setting]\n");
+        $setting = ($dlevel & QUERY_INFO) ? "On" : "Off";
+        e("Query Info: [$setting]\n");
+        $setting = ($dlevel & TEST_INFO) ? "On" : "Off";
+        e("Test Info: [$setting]\n");
+        $items = ["ERROR_INFO" => "Toggle Error Info",
+            "QUERY_INFO" => "Toggle Query Info",
+            "TEST_INFO" => "Toggle Test Info",
+            "configureMenu" => "Return to Main Menu"];
+        do {
+            $choice = $this->drawChooseItems($items, "debugDisplay");
+        } while( $choice == "debugDisplay");
+        $this->prepareGlobals($data);
+        if ($choice == "configureMenu") {
+            $_REQUEST = [];
+            $_SERVER = [];
+            return "configureMenu";
+        }
+        /* constant() takes a fully qualified name and, unlike the bare
+           ERROR_INFO/QUERY_INFO/TEST_INFO references above, does not
+           look in the current namespace. Try this file's namespace
+           first and only then fall back to a global constant. */
+        $flag_name = __NAMESPACE__ . "\\" . $choice;
+        $flag = defined($flag_name) ? constant($flag_name) :
+            constant($choice);
+        // toggle the chosen flag: clear it if set, set it otherwise
+        $dlevel = ($dlevel & $flag) ? $dlevel - $flag : $dlevel + $flag;
+        if ($dlevel & ERROR_INFO) {$_REQUEST["ERROR_INFO"] = true;}
+        if ($dlevel & QUERY_INFO) {$_REQUEST["QUERY_INFO"] = true;}
+        if ($dlevel & TEST_INFO) {$_REQUEST["TEST_INFO"] = true;}
+        return "debugDisplay";
+    }
+    /**
+     * Configures the ways search results of this Yioop instance may be
+     * accessed (via the web, via open rss search results, via the API).
+     * The current on/off state of each is printed, the user picks one to
+     * toggle (or the main menu), and the flipped value is stored in
+     * $_REQUEST under the chosen setting's name.
+     *
+     * @return string "configureMenu" if there is no profile or the user
+     *      asked for the main menu, otherwise "searchAccess"
+     */
+    function searchAccess()
+    {
+        $this->banner();
+        $config = $this->callConfigure();
+        if ($config["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        e("CURRENT SEARCH ACCESS SETTINGS\n==============================\n");
+        $labels = [
+            "WEB_ACCESS" => "Web",
+            "RSS_ACCESS" => "RSS",
+            "API_ACCESS" => "API"
+        ];
+        $choices = [];
+        foreach ($labels as $name => $label) {
+            if ($config[$name]) {
+                $state = "On";
+            } else {
+                $state = "Off";
+            }
+            e($label . ": [" . $state . "]\n");
+            $choices[$name] = "Toggle " . $label;
+        }
+        $choices["configureMenu"] = "Return to Main Menu";
+        // keep asking for as long as the menu hands back this same state
+        $picked = "searchAccess";
+        while ($picked == "searchAccess") {
+            $picked = $this->drawChooseItems($choices, "searchAccess");
+        }
+        $this->prepareGlobals($config);
+        if ($picked != "configureMenu") {
+            $_REQUEST[$picked] = !$config[$picked];
+            return "searchAccess";
+        }
+        $_REQUEST = [];
+        $_SERVER = [];
+        return "configureMenu";
+    }
+    /**
+     * Shows which elements and links of the search results page (word
+     * suggest, subsearch links, sign-in links, cache, similar, inlinks and
+     * IP links) are currently on and lets the user toggle one of them.
+     *
+     * @return string name of the menu to draw next
+     */
+    function searchPageElementLinks()
+    {
+        $this->banner();
+        $data = $this->callConfigure();
+        if ($data["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        $element_labels = [
+            "WORD_SUGGEST" => "Word Suggest",
+            "SUBSEARCH_LINK" => "Subsearch Links",
+            "SIGNIN_LINK" => "Sign-in Links",
+            "CACHE_LINK" => "Cache Link",
+            "SIMILAR_LINK" => "Similar Link",
+            "IN_LINK" => "Inlinks",
+            "IP_LINK" => "IP Links"
+        ];
+        e("CURRENT SEARCH PAGE ELEMENTS AND LINKS SETTINGS".
+            "\n===================================================\n");
+        $menu = [];
+        foreach ($element_labels as $field => $label) {
+            $state = "Off";
+            if ($data[$field]) {
+                $state = "On";
+            }
+            e("$label: [$state]\n");
+            $menu[$field] = "Toggle $label";
+        }
+        $menu["configureMenu"] = "Return to Main Menu";
+        $this_menu = "searchPageElementLinks";
+        $choice = $this_menu;
+        while ($choice == $this_menu) {
+            $choice = $this->drawChooseItems($menu, $this_menu);
+        }
+        $this->prepareGlobals($data);
+        if ($choice != "configureMenu") {
+            // stage the opposite of the current value of the chosen field
+            $_REQUEST[$choice] = !$data[$choice];
+            return "searchPageElementLinks";
+        }
+        $_REQUEST = [];
+        $_SERVER = [];
+        return "configureMenu";
+    }
+    /**
+     * Displays and edits the name server settings: the server key, the name
+     * server url, whether the file cache and memcache are used, and the
+     * list of memcache servers.
+     *
+     * The menu is drawn under the view name "nameServerMenu" because
+     * "nameServer" is itself one of the selectable items.
+     *
+     * @return string name of the menu to draw next
+     */
+    function nameServer()
+    {
+        $this->banner();
+        $data = $this->callConfigure();
+        if ($data["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        $cache_labels = ["USE_FILECACHE" => "Use File Cache",
+            "USE_MEMCACHE" => "Use Memcache"];
+        $menu = ["serverKey" => "Edit Server Key",
+            "nameServer" => "Edit Name Server Url"];
+        e("NAME SERVER SETTINGS\n====================\n");
+        e("Server Key: [".$data["AUTH_KEY"]."]\n");
+        e("Name Server URL: [".$data["NAME_SERVER"]."]\n");
+        foreach ($cache_labels as $field => $label) {
+            $state = "Off";
+            if ($data[$field]) {
+                $state = "On";
+            }
+            e("$label: [$state]\n");
+            $menu[$field] = "Toggle $label";
+        }
+        e("\nMemcache Servers:\n=================\n".$data["MEMCACHE_SERVERS"].
+            "\n=================\n");
+        $menu["memcacheServers"] = "Edit Memcache Servers";
+        $menu["configureMenu"] = "Return to Main Menu";
+        $choice = "nameServerMenu";
+        while ($choice == "nameServerMenu") {
+            $choice = $this->drawChooseItems($menu, "nameServerMenu");
+        }
+        $this->prepareGlobals($data);
+        if ($choice == "configureMenu") {
+            $_REQUEST = [];
+            $_SERVER = [];
+            return "configureMenu";
+        }
+        if ($choice == "serverKey") {
+            e("Enter a new server key: ");
+            $_REQUEST["AUTH_KEY"] = L\readInput();
+        } else if ($choice == "nameServer") {
+            e("Enter a new name server url: ");
+            $_REQUEST["NAME_SERVER"] = L\readInput();
+        } else if ($choice == "memcacheServers") {
+            e("Enter memcache servers, one per line.\n".
+              "Terminate input with a line with only '.' on it:\n");
+            $_REQUEST["MEMCACHE_SERVERS"] = L\readMessage();
+        } else {
+            // anything else is one of the cache toggles
+            $_REQUEST[$choice] = !$data[$choice];
+        }
+        return "nameServer";
+    }
+    /**
+     * Displays the crawl robot's name, instance value and description and
+     * lets the user replace one of them.
+     *
+     * @return string name of the menu to draw next
+     */
+    function robotSetUp()
+    {
+        $this->banner();
+        $data = $this->callConfigure();
+        if ($data["PROFILE"] != 1) {
+            $_REQUEST["MESSAGE"] = "Work directory needs to be set/created!";
+            return "configureMenu";
+        }
+        e("CRAWL ROBOT SETTINGS\n====================\n");
+        e("Crawl Robot Name: [".$data["USER_AGENT_SHORT"]."]\n");
+        e("Robot Instance: [".$data["ROBOT_INSTANCE"]."]\n");
+        e("\nRobot Description:\n=================\n".
+            $data["ROBOT_DESCRIPTION"] . "\n=================\n");
+        $menu = [
+            "robotName" => "Edit Robot Name",
+            "robotInstance" => "Edit Robot Instance",
+            "robotDescription" => "Edit Robot Description",
+            "configureMenu" => "Return to Main Menu"
+        ];
+        $choice = "robotSetUp";
+        while ($choice == "robotSetUp") {
+            $choice = $this->drawChooseItems($menu, "robotSetUp");
+        }
+        $this->prepareGlobals($data);
+        if ($choice == "configureMenu") {
+            $_REQUEST = [];
+            $_SERVER = [];
+            return "configureMenu";
+        }
+        if ($choice == "robotName") {
+            e("Enter a new robot name: ");
+            $_REQUEST["USER_AGENT_SHORT"] = L\readInput();
+        } else if ($choice == "robotInstance") {
+            e("Enter a new robot instance value: ");
+            $_REQUEST["ROBOT_INSTANCE"] = L\readInput();
+        } else if ($choice == "robotDescription") {
+            e("Enter a description of your web crawler robot.\n".
+              "Terminate input with a line with only '.' on it:\n");
+            $_REQUEST["ROBOT_DESCRIPTION"] = L\readMessage();
+        }
+        return "robotSetUp";
+    }
+    /**
+     * Asks the user whether to confirm, re-enter, or abandon the last
+     * profile change.
+     *
+     * On confirm, $admin_method is invoked on the admin component whose
+     * activity list (AdminController::$component_activities) contains it,
+     * falling back to the "system" component, and any MESSAGE it returns is
+     * passed on for display. In every case $_SERVER, $_SESSION and $_REQUEST
+     * are emptied before returning.
+     *
+     * @param string $admin_method to call if confirmed
+     * @param string $reenter_method return value if reenter is chosen
+     * @return string name of the menu to go to next
+     */
+    function confirmChange($admin_method, $reenter_method)
+    {
+        $items = ["confirm" => "Confirm Change",
+            "reenter" => "Re-enter the information",
+            "configureMenu" => "Return to the Configure Menu"];
+        do {
+            $choice = $this->drawChooseItems($items, "confirmChange");
+        } while ($choice == "confirmChange");
+        $confirmed = ($choice == "confirm");
+        $next_menu = "configureMenu";
+        $message = "";
+        if ($confirmed) {
+            $component = "system";
+            foreach (AdminController::$component_activities as
+                $available_component => $activities) {
+                if (in_array($admin_method, $activities)) {
+                    $component = $available_component;
+                    break;
+                }
+            }
+            // must run before the super globals are cleared below
+            $data = $this->admin->component($component)->$admin_method();
+            if (isset($data["MESSAGE"])) {
+                $message = $data["MESSAGE"];
+            }
+        } else if ($choice == "reenter") {
+            $next_menu = $reenter_method;
+        }
+        $_SERVER = [];
+        $_SESSION = [];
+        $_REQUEST = [];
+        if ($confirmed) {
+            $_REQUEST["MESSAGE"] = $message;
+        }
+        return $next_menu;
+    }
+    /**
+     * Draws a numbered list of options to the screen, prints and clears any
+     * pending $_REQUEST["MESSAGE"], and reads a choice from the user.
+     *
+     * Only input made up entirely of digits that names one of the listed
+     * numbers is accepted. Anything else (empty input, "1.5", "2abc", a
+     * number out of range) is reported as invalid, so this method never
+     * returns a value that is not either a key of $items or $currentView.
+     *
+     * @param array $items as associative array (return value => description)
+     * @param string $currentView value to return if invalid choice made
+     * @return string the key of the chosen item, or $currentView if the
+     *     input was not a valid choice
+     */
+    function drawChooseItems($items, $currentView)
+    {
+        $choice_nums = [];
+        $i = 1;
+        e("\nAvailable Options:\n==================\n");
+        foreach ($items as $name => $description) {
+            e("($i) $description\n");
+            $choice_nums[$i] = $name;
+            $i++;
+        }
+        if (!empty($_REQUEST["MESSAGE"])) {
+            e("\n+++ ".$_REQUEST["MESSAGE"]." +++\n");
+            unset($_REQUEST["MESSAGE"]);
+        }
+        e("\nPlease choose an option:\n");
+        $user_data = trim(L\readInput());
+        /* A loose numeric comparison would let strings such as "1.5" or
+           "2abc" through and then miss in $choice_nums, so require a plain
+           run of digits and look the integer up directly. 0 is never a
+           listed number, so it serves as the "not a number" value.
+         */
+        $choice_num = (preg_match('/^[0-9]+$/', $user_data)) ?
+            intval($user_data) : 0;
+        if (isset($choice_nums[$choice_num])) {
+            $_REQUEST["MESSAGE"] = "";
+            return $choice_nums[$choice_num];
+        }
+        $_REQUEST["MESSAGE"] = "Invalid choice. Please choose again.";
+        return $currentView;
+    }
+    /**
+     * Calls the system component's configure method. If the returned data
+     * has no 'PROFILE' entry this is taken to mean a redirect happened: the
+     * request is cleared, configure() is called a second time, and the
+     * message from the first call (if any) is carried over into the result.
+     *
+     * @return array data returned by configure()
+     */
+    function callConfigure()
+    {
+        $data = $this->admin->component("system")->configure();
+        if (isset($data["PROFILE"])) {
+            return $data;
+        }
+        $redirect_message = "";
+        if (isset($data['MESSAGE'])) {
+            $redirect_message = $data['MESSAGE'];
+        }
+        $_REQUEST = [];
+        $data = $this->admin->component("system")->configure();
+        $data['MESSAGE'] = $redirect_message;
+        return $data;
+    }
+    /**
+     * Clears the terminal with ANSI escape sequences and prints the title
+     * banner of this configuration tool
+     */
+    function banner()
+    {
+        $escape = chr(27);
+        $clear_screen = $escape . "[2J";
+        $cursor_home = $escape . "[;H";
+        e($clear_screen . $cursor_home);
+        e("\n\nYIOOP! CONFIGURATION TOOL\n");
+        e("+++++++++++++++++++++++++\n\n");
+    }
+    /**
+     * Fills in the super globals the way a profile change needs them:
+     * $_SESSION is emptied, $_REQUEST becomes the profile fields of $data
+     * plus arg = "profile" and an empty YIOOP_TOKEN, and
+     * $_SERVER['REQUEST_URI'], if not already set, is taken from
+     * $data['WEB_URI'] or, failing that, asked of the user.
+     *
+     * @param array $data current profile state
+     */
+    function prepareGlobals($data)
+    {
+        $_SESSION = [];
+        $request = $this->copyProfileFields($data);
+        $request["arg"] = "profile";
+        $request['YIOOP_TOKEN'] = "";
+        $_REQUEST = $request;
+        if (isset($_SERVER['REQUEST_URI'])) {
+            return;
+        }
+        if (empty($data['WEB_URI'])) {
+            e("Enter web path for Yioop instance:\n");
+            $_SERVER['REQUEST_URI'] = L\readInput();
+        } else {
+            $_SERVER['REQUEST_URI'] = $data['WEB_URI'];
+        }
+    }
+    /**
+     * Builds a new array holding only those entries of $data whose keys are
+     * listed in the profile model's profile_fields and whose values are set.
+     *
+     * @param array $data an array of profile and other fields
+     * @return array a new array containing a copy of just the profile fields
+     *     from the original array
+     */
+    function copyProfileFields($data)
+    {
+        $field_names = $this->admin->model("profile")->profile_fields;
+        $profile = [];
+        foreach ($field_names as $field) {
+            if (!isset($data[$field])) {
+                continue;
+            }
+            $profile[$field] = $data[$field];
+        }
+        return $profile;
+    }
+}
+// Script entry point: create the configuration tool and call its loop() method
+$configure_tool = new ConfigureTool();
+$configure_tool->loop();
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@ -0,0 +1,546 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * This script can be used to set up the database and filesystem for the
+ * seekquarry database system. The SeekQuarry system is deployed with a
+ * minimal sqlite database so this script is not strictly needed.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\configs;
+
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\models\Model;
+use seekquarry\yioop\models\ProfileModel;
+use seekquarry\yioop\models\GroupModel;
+
+/* Refuse to run when a document root is set. NOTE(review): presumably this
+   is how a request coming through a web server is detected -- confirm.
+ */
+if (isset($_SERVER['DOCUMENT_ROOT']) && $_SERVER['DOCUMENT_ROOT']) {
+    exit("BAD REQUEST");
+}
+/** For crawlHash function */
+require_once __DIR__."/../library/Utility.php";
+/** For wiki page translation stuff */
+require_once __DIR__."/../library/LocaleFunctions.php";
+/** To make it easy to insert translations */
+require_once __DIR__."/../library/UpgradeFunctions.php";
+/* Connect with the configured datasource manager and start from an empty
+   database, then create the tables.
+ */
+$profile_model = new ProfileModel(DB_NAME, false);
+$db_class = NS_DATASOURCES . ucfirst(DBMS)."Manager";
+$dbinfo = ["DBMS" => DBMS, "DB_HOST" => DB_HOST, "DB_USER" => DB_USER,
+    "DB_PASSWORD" => DB_PASSWORD, "DB_NAME" => DB_NAME];
+if (!in_array(DBMS, ['sqlite', 'sqlite3'])) {
+    $db = new $db_class();
+    $db->connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME);
+    /*  postgres doesn't let you drop a database while connected to it so drop
+        tables instead first
+     */
+    $profile_model->initializeSql($db, $dbinfo);
+    $database_tables = array_keys($profile_model->create_statements);
+    foreach ($database_tables as $table) {
+        /* IF EXISTS, as for the database below: on a first run none of
+           the tables exist yet and an unconditional DROP would fail
+         */
+        $db->execute("DROP TABLE IF EXISTS " . $table);
+    }
+    $db->execute("DROP DATABASE IF EXISTS " . DB_NAME);
+    $db->execute("CREATE DATABASE " . DB_NAME);
+    $db->disconnect();
+    $db->connect(); // default connection goes to actual DB
+} else {
+    // for sqlite the database is a single file, so just remove it
+    @unlink(CRAWL_DIR."/data/" . DB_NAME . ".db");
+    $db = new $db_class();
+    $db->connect();
+}
+if (!$profile_model->createDatabaseTables($db, $dbinfo)) {
+    echo "\n\nCouldn't create database tables!!!\n\n";
+    exit();
+}
+// record which version of Yioop this database was created for
+$db->execute("INSERT INTO VERSION VALUES (" . YIOOP_VERSION . ")");
+$creation_time = L\microTimestamp();
+// store a freshly generated Fiat-Shamir modulus in the profile
+$profile = $profile_model->getProfile(WORK_DIRECTORY);
+$new_profile = $profile;
+$new_profile['FIAT_SHAMIR_MODULUS'] = L\generateFiatShamirModulus();
+$profile_model->updateProfile(WORK_DIRECTORY, $new_profile, $profile);
+/* $zkp_password is what gets stored for a blank password: with S the
+   numerical value of sha1 of the empty string it is V = S ^ 2 mod N. Both
+   values stay empty when no modulus could be generated.
+ */
+$sha1_of_blank_string = "";
+$zkp_password = "";
+if ($new_profile['FIAT_SHAMIR_MODULUS']) {
+    $sha1_of_blank_string = L\bchexdec(sha1(''));
+    $temp = bcpow($sha1_of_blank_string . '', '2');
+    $zkp_password = bcmod($temp, $new_profile['FIAT_SHAMIR_MODULUS']);
+}
+/* Default account is root without a password: the stored password is
+   crawlCrypt of the empty string and the account is given ACTIVE_STATUS.
+   Note the SQL literal below spans source lines, so its line breaks and
+   indentation are part of the statement sent to the database.
+ */
+$sql ="INSERT INTO USERS VALUES (" . ROOT_ID . ", 'admin', 'admin','" .
+        ROOT_USERNAME . "',
+        'root@dev.null', '".L\crawlCrypt('')."', '".ACTIVE_STATUS.
+        "', '".L\crawlCrypt(ROOT_USERNAME . AUTH_KEY . $creation_time).
+        "', 0,'$creation_time', 0, 0, '$zkp_password')";
+$db->execute($sql);
+/* The public account is an inactive account (INACTIVE_STATUS) used for
+   public permissions. Like root, it is created with a blank password.
+ */
+$sql ="INSERT INTO USERS VALUES (".PUBLIC_USER_ID.", 'all', 'all','public',
+        'public@dev.null', '".L\crawlCrypt('')."', '".INACTIVE_STATUS.
+        "', '".L\crawlCrypt('public' . AUTH_KEY . $creation_time)."', 0,
+        '$creation_time', 0, 0, '$zkp_password')";
+$db->execute($sql);
+// Default public group (id PUBLIC_GROUP_ID), owned by root
+$creation_time = L\microTimestamp();
+$sql = "INSERT INTO GROUPS VALUES(".PUBLIC_GROUP_ID.",'Public','".
+    $creation_time."','".ROOT_ID."', '".PUBLIC_JOIN."', '".GROUP_READ.
+    "', ".NON_VOTING_GROUP.", " . FOREVER . ")";
+$db->execute($sql);
+$now = time();
+// The four built-in roles; only root is assigned one (Admin) here
+$db->execute("INSERT INTO ROLE VALUES (".ADMIN_ROLE.", 'Admin' )");
+$db->execute("INSERT INTO ROLE VALUES (".USER_ROLE.", 'User' )");
+$db->execute("INSERT INTO ROLE VALUES (".BUSINESS_ROLE.", 'Business User' )");
+$db->execute("INSERT INTO ROLE VALUES (".BOT_ROLE.", 'Bot User' )");
+$db->execute("INSERT INTO USER_ROLE VALUES (".ROOT_ID.", ".ADMIN_ROLE.")");
+// Both root and the public user are active members of the public group
+$db->execute("INSERT INTO USER_GROUP VALUES (".ROOT_ID.", ".
+    PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)");
+$db->execute("INSERT INTO USER_GROUP VALUES (".PUBLIC_USER_ID.", ".
+    PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)");
+/* Create a group for the wiki help pages. It reuses $creation_time from the
+   public group. The SQL literal spans source lines, so do not re-wrap it.
+ */
+$sql = "INSERT INTO GROUPS VALUES(" . HELP_GROUP_ID . ",'Help','" .
+    $creation_time . "','" . ROOT_ID . "',
+    '" . PUBLIC_BROWSE_REQUEST_JOIN . "', '" . GROUP_READ_WIKI .
+    "', " . UP_DOWN_VOTING_GROUP . ", " . FOREVER . ")";
+$db->execute($sql);
+$now = time();
+// Root and the public user are active members of the help group as well
+$db->execute("INSERT INTO USER_GROUP VALUES (" . ROOT_ID . ", " .
+    HELP_GROUP_ID . ", " . ACTIVE_STATUS . ", $now)");
+$db->execute("INSERT INTO USER_GROUP VALUES (" . PUBLIC_USER_ID . ", " .
+    HELP_GROUP_ID . ", " . ACTIVE_STATUS . ", $now)");
+// Group model used below to insert wiki pages; it shares this script's $db
+$group_model = new GroupModel(DB_NAME, false);
+$group_model->db = $db;
+/* Load the default wiki pages ($public_pages and $help_pages). A copy of
+   PublicHelpPages.php in APP_DIR takes precedence over the one in BASE_DIR.
+ */
+$wiki_pages_path = APP_DIR . "/configs/PublicHelpPages.php";
+if (!file_exists($wiki_pages_path)) {
+    $wiki_pages_path = BASE_DIR . "/configs/PublicHelpPages.php";
+}
+require_once $wiki_pages_path;
+$default_locale = L\getLocaleTag();
+/* Insert the default public wiki pages into the public group and then the
+   default help pages into the help group. The locale is switched per
+   batch of pages so that the tl() strings are translated for that locale.
+ */
+$wiki_page_sets = [
+    [PUBLIC_GROUP_ID, $public_pages],
+    [HELP_GROUP_ID, $help_pages]
+];
+foreach ($wiki_page_sets as $wiki_page_set) {
+    $wiki_group_id = $wiki_page_set[0];
+    foreach ($wiki_page_set[1] as $locale_tag => $locale_pages) {
+        L\setLocaleObject($locale_tag);
+        foreach ($locale_pages as $page_title => $page_text) {
+            $page_name = str_replace(" ", "_", $page_title);
+            $page_content = str_replace("'", "&#039;", $page_text);
+            $created_message = L\tl('social_component_page_created',
+                $page_name);
+            $discuss_message = L\tl('social_component_page_discuss_here');
+            $group_model->setPageName(ROOT_ID, $wiki_group_id, $page_name,
+                $page_content, $locale_tag, "", $created_message,
+                $discuss_message);
+        }
+    }
+}
+// put back the locale that was active before the pages were inserted
+L\setLocaleObject($default_locale);
+/* End Help content insertion. */
+/*
+   Set up generic page relationship
+ */
+$db->execute("INSERT INTO PAGE_RELATIONSHIP VALUES (-1, 'generic_links')");
+/* The locales to insert, each as [locale tag, name of the language written
+   in that language, writing mode]. We insert 1 by 1 rather than comma
+   separate as sqlite does not support comma separated inserts.
+ */
+$locales = [
+    ['en-US', 'English', 'lr-tb'],
+    ['ar', 'العربية', 'rl-tb'],
+    ['de', 'Deutsch', 'lr-tb'],
+    ['es', 'Español', 'lr-tb'],
+    ['fr-FR', 'Français', 'lr-tb'],
+    ['he', 'עברית', 'rl-tb'],
+    ['in-ID', 'Bahasa', 'lr-tb'],
+    ['it', 'Italiano', 'lr-tb'],
+    ['ja', '日本語', 'lr-tb'],
+    ['ko', '한국어', 'lr-tb'],
+    ['nl', 'Nederlands', 'lr-tb'],
+    ['pl', 'Polski', 'lr-tb'],
+    ['pt', 'Português', 'lr-tb'],
+    ['ru', 'Русский', 'lr-tb'],
+    ['th', 'ไทย', 'lr-tb'],
+    ['vi-VN', 'Tiếng Việt', 'lr-tb'],
+    ['zh-CN', '中文', 'lr-tb'],
+    ['kn', 'ಕನ್ನಡ', 'lr-tb'],
+    ['hi', 'हिन्दी', 'lr-tb'],
+    ['tr', 'Türkçe', 'lr-tb'],
+    ['fa', 'فارسی', 'rl-tb'],
+    ['te', 'తెలుగు', 'lr-tb'],
+];
+/* Insert each locale under the id given by its 1-based position in
+   $locales, and remember tag => id in $locale_index for the translation
+   inserts that follow. The SQL literal spans two source lines on purpose.
+ */
+for ($i = 1; $i <= count($locales); $i++) {
+    $locale = $locales[$i - 1];
+    $insert_locale = "INSERT INTO LOCALE VALUES ($i, '{$locale[0]}',
+        '{$locale[1]}', '{$locale[2]}', '1')";
+    $db->execute($insert_locale);
+    $locale_index[$locale[0]] = $i;
+}
+/* The admin activities to create, in the order that determines their ids.
+   Each entry maps the activity's method name to a pair: the translation
+   string identifier of its label, and the label's translations keyed by
+   locale tag (every tag used here also appears in $locales).
+ */
+$activities = [
+    "manageAccount" => ['db_activity_manage_account',
+        [
+            "en-US" => 'Manage Account',
+            "fa" => 'مدیریت حساب',
+            "fr-FR" => 'Modifier votre compte',
+            "ja" => 'アカウント管理',
+            "ko" => '사용자 계정 관리',
+            "nl" => 'Account Beheren',
+            "vi-VN" => 'Quản lý tài khoản',
+            "zh-CN" => '管理帳號',
+        ]],
+    "manageUsers" => ['db_activity_manage_users',
+        [
+            "en-US" => 'Manage Users',
+            "fa" => 'مدیریت کاربران',
+            "fr-FR" => 'Modifier les utilisateurs',
+            "ja" => 'ユーザー管理',
+            "ko" => '사용자 관리',
+            "nl" => 'Gebruikers beheren',
+            "vi-VN" => 'Quản lý tên sử dụng',
+            "zh-CN" => '管理使用者',
+        ]],
+    "manageRoles" => ['db_activity_manage_roles',
+        [
+            "en-US" => 'Manage Roles',
+            "fa" => 'مدیریت نقش‌ها',
+            "fr-FR" => 'Modifier les rôles',
+            "ja" => '役割管理',
+            "ko" => '사용자 권한 관리',
+            "nl" => 'Rollen beheren',
+            "vi-VN" => 'Quản lý chức vụ',
+        ]],
+    "manageGroups" => ['db_activity_manage_groups',
+        [
+            "en-US" => 'Manage Groups',
+            "fr-FR" => 'Modifier les groupes',
+            "nl" => 'Groepen beheren',
+        ]],
+    "manageCrawls" => ['db_activity_manage_crawl',
+        [
+            "en-US" => 'Manage Crawls',
+            "fa" => 'مدیریت خزش‌ها',
+            "fr-FR" => 'Modifier les indexes',
+            "ja" => '検索管理',
+            "ko" => '크롤 관리',
+            "nl" => 'Beheer Crawls',
+            "vi-VN" => 'Quản lý sự bò',
+        ]],
+    "groupFeeds" => ['db_activity_group_feeds',
+        [
+            "en-US" => 'Feeds and Wikis',
+            "nl" => 'Feeds en Wikis',
+        ]],
+    "mixCrawls" => ['db_activity_mix_crawls',
+        [
+            "en-US" => 'Mix Crawls',
+            "fa" => 'ترکیب‌های خزش‌ها',
+            "fr-FR" => 'Mélanger les indexes',
+            "nl" => 'Mix Crawls',
+        ]],
+    "manageClassifiers" => ['db_activity_manage_classifiers',
+        [
+            "en-US" => 'Manage Classifiers',
+            "fa" => '',
+            "fr-FR" => 'Classificateurs',
+            "nl" => 'Beheer Classifiers',
+        ]],
+    "pageOptions" => ['db_activity_file_options',
+        [
+            "en-US" => 'Page Options',
+            "fa" => 'تنظیمات صفحه',
+            "fr-FR" => 'Options de fichier',
+            "nl" => 'Opties voor de pagina',
+        ]],
+    "resultsEditor" => ['db_activity_results_editor',
+        [
+            "en-US" => 'Results Editor',
+            "fa" => 'ویرایشگر نتایج',
+            "fr-FR" => 'Éditeur de résultats',
+            "nl" => 'Resultaten Editor',
+        ]],
+    "searchSources" => ['db_activity_search_services',
+        [
+            "en-US" => 'Search Sources',
+            "fa" => 'منابع جستجو',
+            "fr-FR" => 'Sources de recherche',
+            "nl" => 'Zoek Bronnen',
+        ]],
+    "manageMachines" => ['db_activity_manage_machines',
+        [
+            "en-US" => 'Manage Machines',
+            "fa" => 'مدیریت دستگاه‌ها',
+            "fr-FR" => 'Modifier les ordinateurs',
+            "nl" => 'Beheer Machines',
+        ]],
+    "manageLocales" => ['db_activity_manage_locales',
+        [
+            "en-US" => 'Manage Locales',
+            "fa" => 'مدیریت زبان‌ها',
+            "fr-FR" => 'Modifier les lieux',
+            "ja" => 'ローケル管理',
+            "ko" => '로케일 관리',
+            "nl" => 'Beheer varianten',
+            "vi-VN" => 'Quản lý miền địa phương',
+        ]],
+    "serverSettings" => ['db_activity_server_settings',
+        [
+            "en-US" => 'Server Settings',
+            "fr-FR" => 'Serveurs',
+            "nl" => 'Server Settings',
+        ]],
+    "security" => ['db_activity_security',
+        [
+            "en-US" => 'Security',
+            "fr-FR" => 'Sécurité',
+            "nl" => 'Veiligheid',
+        ]],
+    "appearance" => ['db_activity_appearance',
+        [
+            "en-US" => 'Appearance',
+            "fr-FR" => 'Aspect',
+            "nl" => 'Verschijning',
+        ]],
+    "configure" => ['db_activity_configure',
+        [
+            "en-US" => 'Configure',
+            "fa" => 'پیکربندی',
+            "fr-FR" => 'Configurer',
+            "ja" => '設定',
+            "ko" => '구성',
+            "nl" => 'Configureren',
+            "vi-VN" => 'Sắp xếp hoạt động dựa theo hoạch định',
+        ]],
+    "manageCredits" => ['db_activity_manage_credits',
+        [
+            "en-US" => 'Manage Credits',
+        ]],
+    "manageAdvertisements" => ['db_activity_manage_advertisements',
+        [
+            "en-US" => 'Manage Advertisements',
+        ]],
+    "scrapers" => ['db_activity_scrapers',
+        [
+            "en-US" => 'Web Scrapers',
+        ]]
+];
+/* Insert every activity under the id given by its 1-based position in
+   $activities, reusing that id for the translation of its label. The
+   credit and advertisement activities are granted to the business role;
+   every other activity is granted to the admin role.
+ */
+$i = 1;
+foreach ($activities as $activity => $translation_info) {
+    // set-up activity
+    $db->execute("INSERT INTO ACTIVITY VALUES ($i, $i, '$activity')");
+    if (in_array($activity, ["manageCredits", "manageAdvertisements"])) {
+        $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (" .
+            BUSINESS_ROLE . ", $i)");
+    } else {
+        $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (" .
+            ADMIN_ROLE . ", $i)");
+    }
+    $db->execute("INSERT INTO TRANSLATION
+        VALUES($i, '{$translation_info[0]}')");
+    foreach ($translation_info[1] as $locale_tag => $translation) {
+        if (!isset($locale_index[$locale_tag])) {
+            // no such locale was inserted, so there is no id to attach to
+            continue;
+        }
+        $index = $locale_index[$locale_tag];
+        /* the translated text is bound rather than spliced into the SQL
+           so that a label containing an apostrophe cannot break the
+           statement
+         */
+        $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES ($i, $index, ?)",
+            [$translation]);
+    }
+    $i++;
+}
+/* Activities the plain user role is given. The id of an activity is its
+   1-based position among the keys of $activities.
+ */
+$new_user_activities = [
+    "manageAccount",
+    "manageGroups",
+    "mixCrawls",
+    "groupFeeds"
+];
+$activity_names = array_keys($activities);
+foreach ($new_user_activities as $new_activity) {
+    $position = array_search($new_activity, $activity_names);
+    if ($position === false) {
+        continue;
+    }
+    //give new user role the ability to have that activity
+    $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (".
+        USER_ROLE . ", " . ($position + 1) . ")");
+}
+/* Default media sources: five video sites and one rss feed. The SQL
+   literals below span source lines, so their line breaks and indentation
+   are part of the statements sent to the database.
+ */
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634195',
+    'YouTube', 'video', 'http://www.youtube.com/watch?v={}',
+    'http://i1.ytimg.com/vi/{}/default.jpg', '')");
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634196',
+    'MetaCafe', 'video', 'http://www.metacafe.com/watch/{}',
+    'http://www.metacafe.com/thumb/{}.jpg', '')");
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634197',
+    'DailyMotion', 'video', 'http://www.dailymotion.com/video/{}',
+    'http://www.dailymotion.com/thumbnail/video/{}', '')");
+// NOTE(review): this blank.png is on www.yioop.com, the next on NAME_SERVER
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634198',
+    'Vimeo', 'video', 'http://player.vimeo.com/video/{}',
+    'http://www.yioop.com/resources/blank.png?{}', '')");
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634199',
+    'Break.com', 'video', 'http://www.break.com/index/{}', '" .
+    NAME_SERVER . "/resources/blank.png?{}', '')");
+$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634200',
+    'Yahoo News', 'rss', 'http://news.yahoo.com/rss/',
+    '//content/@url', 'en-US')");
+/* Three crawl mixes owned by root -- images (2), videos (3) and news (4) --
+   each with one fragment and one component holding a media: query
+ */
+$db->execute("INSERT INTO CRAWL_MIXES VALUES (2, 'images', ".ROOT_ID.", -1)");
+$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(2, 0, 1)");
+$db->execute("INSERT INTO MIX_COMPONENTS VALUES(
+    2, 0, 1, 1, 'media:image site:doc')");
+$db->execute("INSERT INTO CRAWL_MIXES VALUES (3, 'videos', ".ROOT_ID.", -1)");
+$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(3, 0, 1)");
+$db->execute("INSERT INTO MIX_COMPONENTS VALUES(
+    3, 0, 1, 1, 'media:video site:doc')");
+$db->execute("INSERT INTO CRAWL_MIXES VALUES (4, 'news', ".ROOT_ID.", -1)");
+$db->execute("INSERT INTO MIX_FRAGMENTS VALUES(4, 0, 1)");
+$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 1, 1,
+    'media:news')");
+/* One subsearch per mix above (m:2, m:3, m:4), each with a translation
+   string identifier registered under ids 1002 to 1004
+ */
+$db->execute("INSERT INTO SUBSEARCH VALUES('db_subsearch_images',
+    'images','m:2', 50)");
+$db->execute("INSERT INTO TRANSLATION VALUES (1002,'db_subsearch_images')");
+$db->execute("INSERT INTO SUBSEARCH VALUES ('db_subsearch_videos',
+    'videos','m:3', 10)");
+$db->execute("INSERT INTO TRANSLATION VALUES (1003,'db_subsearch_videos')");
+$db->execute("INSERT INTO SUBSEARCH VALUES ('db_subsearch_news',
+    'news','m:4',20)");
+$db->execute("INSERT INTO TRANSLATION VALUES (1004,'db_subsearch_news')");
+/* Default web scrapers, each as [name, signature, scrape rules]. The
+   signature and the individual rules are XPath expressions; the rules of
+   one scraper are joined into a single string with ### between them.
+   Rows are inserted with bound parameters.
+ */
+$sql = "INSERT INTO SCRAPER(NAME, SIGNATURE, SCRAPE_RULES) VALUES (?,?,?)";
+$scrapers = [
+    ["YIOOP", "/html/head/*[contains(@href,".
+        "'c=resource&amp;a=get&amp;f=css&amp;n=auxiliary.css')]",
+        "//div[contains(@class, 'body-container')]###" .
+        "//*[contains(@id, 'message')]###//*[contains(@id, 'help')]###" .
+        "//*[contains(@id, 'MathJax')]###" .
+        "//*[contains(@class, 'component-container')]###" .
+        "//*[contains(@class, 'top-bar')]###".
+        "//*[contains(@class, 'query-statistics')]###" .
+        "//*[contains(@class, 'admin-collapse')]###" .
+        "//option[not(contains(@selected, 'selected'))]###" .
+        "//*[contains(@id, 'suggest')]###//*[contains(@id, 'spell')]"],
+    ["DRUPAL", "/html/head/*[contains(@href, '/sites/all/themes') or " .
+        "contains(@href, '/sites/default/files') or ".
+        "contains(@content, 'Drupal')]",
+        "//div[@id='page']|//main" .
+        "###//*[contains(@id,'comments')]" .
+        "###//*[contains(@id,'respond')]" .
+        "###//*[contains(@class,'bottomcontainerBox')]" .
+        "###//*[contains(@class,'post-by')]" .
+        "###//*[contains(@class,'entry meta-clear')]"],
+    ["MEDIAWIKI", "//meta[contains(@content, 'MediaWiki')]",
+        "//*[contains(@id, 'mw-content-text')]###".
+        "//*[contains(@class, 'nmbox')]###//*[contains(@class, 'hatnote')]###".
+        "//*[contains(@class, 'infobox')]"],
+    /* the threadlisthead and threaddetails rules used to start with a
+       newline and source indentation because of misplaced quotes
+     */
+    ["VBULLETIN", "/html/head/*[contains(@href,'vbulletin')]",
+        "//div[contains(@class, 'body_wrapper')]###" .
+        "//*[contains(@id, 'above')]###//*[contains(@id, 'below')]###" .
+        "//*[contains(@id, 'breadcrumb')]###//*[contains(@id, 'notices')]###" .
+        "//*[contains(@id, 'footer')]###".
+        "//*[contains(@id, 'forum_info_options')]###" .
+        "//*[contains(@class, 'threadlisthead')]###" .
+        "//*[contains(@class, 'threaddetails')]###".
+        "//*[contains(@id, 'pagination')]###".
+        "//*[contains(@class, 'threadstats')]###".
+        "//*[contains(@class, 'threadlastpost')]###".
+        "//span[contains(@class, 'label')]"],
+    ["WORDPRESS", "/html/head/*[contains(@href, 'wp-content')".
+        " or contains(@href, 'wp-includes')]",
+        "//div[starts-with(@id, 'post-') and " .
+        "'post-' = translate(@id, '0123456789', '') and " .
+        "string-length(@id) >4]|//div[contains(@class, 'homepagewrapper')]###".
+        "//*[contains(@id, 'entry-comments')]###" .
+        "//*[contains(@class, 'sharedaddy')]###" .
+        "//*[contains(@class, 'blog-subscribe')]###".
+        "//*[contains(@id, 'entry-side')]"]
+    ];
+foreach ($scrapers as $scraper) {
+    $db->execute($sql, $scraper);
+}
+/* Labels of the three default subsearches, keyed first by translation
+   string identifier and then by locale tag
+ */
+$subsearch_translations = [
+    'db_subsearch_images' => [
+        'en-US' => 'Images',
+        'ar' => 'لصور',
+        'fa' => 'تصاوی',
+        'fr-FR' => 'Images',
+        'nl' => 'Beelden',
+        'vi-VN' => 'Hình',
+        'zh-CN' => '图象'
+    ],
+    'db_subsearch_videos' => [
+        'en-US' => 'Videos',
+        'ar' => 'فيدي',
+        'fa' => 'ویدیوها',
+        'fr-FR' => 'Vidéos',
+        'nl' => 'Videos',
+        'vi-VN' => 'Thâu hình',
+        'zh-CN' => '录影'
+    ],
+    'db_subsearch_news' => [
+        'en-US' => 'News',
+        'ar' => 'أخبار',
+        'fa' => 'اخبا',
+        'fr-FR' => 'Actualités',
+        'nl' => 'Nieuws',
+        'vi-VN' => 'Tin tức',
+        'zh-CN' => '新闻'
+    ]
+];
+// store every subsearch label for every locale it has a translation in
+foreach ($subsearch_translations as $string_id => $labels_by_locale) {
+    foreach (array_keys($labels_by_locale) as $label_locale) {
+        L\updateTranslationForStringId($db, $string_id, $label_locale,
+            $labels_by_locale[$label_locale]);
+    }
+}
+if (stristr(DB_HOST, "pgsql") !== false) {
+    /* The rows above were inserted with explicit ids, which does not
+       advance the sequences behind Postgres SERIAL columns. For each
+       table => auto-increment column below, move the column's sequence
+       up to the largest id in use so later inserts do not collide.
+     */
+    $auto_tables = ["ACTIVITY" =>"ACTIVITY_ID",
+        "GROUP_ITEM" =>"GROUP_ITEM_ID", "GROUP_PAGE" => "GROUP_PAGE_ID",
+        "GROUPS" => "GROUP_ID", "LOCALE"=> "LOCALE_ID", "ROLE" => "ROLE_ID",
+        "TRANSLATION" => "TRANSLATION_ID", "USERS" => "USER_ID"];
+    foreach ($auto_tables as $table => $auto_column) {
+        $sql = "SELECT MAX($auto_column) AS NUM FROM $table";
+        $result = $db->execute($sql);
+        $row = ($result) ? $db->fetchArray($result) : false;
+        if (empty($row['NUM'])) {
+            /* MAX() of an empty table is NULL, which would produce the
+               malformed statement setval('...', ); such a table's
+               sequence is still at its initial value, so leave it alone
+             */
+            continue;
+        }
+        $next = $row['NUM'];
+        $sequence = strtolower("{$table}_{$auto_column}_seq");
+        $sql = "SELECT setval('$sequence', $next)";
+        $db->execute($sql);
+        $sql = "SELECT nextval('$sequence')";
+        $db->execute($sql);
+    }
+}
+
+$db->disconnect();
+// a sqlite database is a plain file: make it readable and writable by all
+$uses_sqlite = in_array(DBMS, ['sqlite','sqlite3']);
+if ($uses_sqlite) {
+    chmod(CRAWL_DIR . "/data/" . DB_NAME . ".db", 0666);
+}
+echo "Create DB succeeded\n";
--- a/src/configs/CreditConfig.php
+++ b/src/configs/CreditConfig.php
@ -0,0 +1,69 @@
+<?php
+/**
+ * SeekQuarry/Yioop -- Credit Card Configuration
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ * All rights reserved
+ */
+namespace seekquarry\yioop\configs;
+
+/**
+ * Class containing methods used to handle payment processing when keyword
+ * advertising is enabled.
+ *
+ * This class is a "blank" implementation that does not charge credit cards.
+ * An implementation that uses stripe.com for payment processing can be
+ * obtained from seekquarry.com. Putting that implementation in the
+ * APP_DIR/configs/ folder would then enable real credit card processing in
+ * Yioop
+ */
+class CreditConfig
+{
+    /**
+     * Returns whether a version of CreditConfig actually capable of charging
+     * cards, receiving bitcoins, etc is in use.
+     *
+     * @return bool whether a real credit card processing class is in use;
+     *      this blank implementation always returns false
+     */
+    public static function isActive()
+    {
+        return false;
+    }
+    /**
+     * Returns the URL to the credit processing Javascript library
+     * responsible for securely sending the credit card details to the
+     * credit payment agency (for example, stripe.com) and then sending along
+     * an authorization token as part of the form to the Yioop backend
+     *
+     * @return string url of the library; this blank implementation has no
+     *      such library and returns the empty string
+     */
+    public static function getCreditTokenUrl()
+    {
+        return "";
+    }
+    /**
+     * Used to get field values from input tag with attribute name set to $name
+     * and attribute value set to value
+     *
+     * @param string $name of attribute (usually data-)
+     * @param string $value value of attribute
+     * @return string field value of the corresponding input tag; this blank
+     *      implementation ignores both arguments and always returns
+     *      "data-ignore"
+     */
+    public static function getAttribute($name, $value)
+    {
+        return "data-ignore";
+    }
+    /**
+     * Server side method that is actually responsible for charging the
+     * credit card
+     *
+     * @param float $amount dollar amount to charge the card
+     * @param string $token token issued for transaction from the card
+     *      processing agency
+     * @param string& $message message to use as for reason for charge
+     * @return bool whether or not the charge was successful; this blank
+     *      implementation charges nothing, leaves $message untouched, and
+     *      always returns true
+     */
+    public static function charge($amount, $token, &$message)
+    {
+        return true;
+    }
+}
--- a/src/configs/ExportPublicHelpDb.php
+++ b/src/configs/ExportPublicHelpDb.php
@ -0,0 +1,147 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * This script can be used to export the public and help wiki pages for
+ * Yioop system to the file APP_DIR/configs/PublicHelpPages.php . This file
+ * is then used by createdb.php when creating a fresh version of the Yioop
+ * database.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\configs;
+
+use seekquarry\yioop\library as L;
+
+// A non-empty DOCUMENT_ROOT means a web server is running us: refuse
+if (!empty($_SERVER['DOCUMENT_ROOT'])) {
+    echo "BAD REQUEST";
+    exit();
+}
+/** For crawlHash function */
+require_once __DIR__."/../library/Utility.php";
+$db_class = NS_DATASOURCES . ucfirst(DBMS)."Manager";
+$dbinfo = [
+    "DBMS" => DBMS,
+    "DB_HOST" => DB_HOST,
+    "DB_USER" => DB_USER,
+    "DB_PASSWORD" => DB_PASSWORD,
+    "DB_NAME" => DB_NAME
+];
+$db = new $db_class();
+// the sqlite managers are connected without arguments; all others get
+// the configured credentials
+if (in_array(DBMS, ['sqlite', 'sqlite3'])) {
+    $db->connect();
+} else {
+    $db->connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME);
+}
+/*
+   For the public group, select each page revision that has a locale and for
+   which no revision of the same page has a later PUBDATE, that is, the
+   current version of each page
+ */
+$sql = implode("", [
+    "SELECT GPH.TITLE AS TITLE, GPH.PAGE AS PAGE, ",
+    " GPH.LOCALE_TAG AS LOCALE_TAG FROM GROUP_PAGE_HISTORY GPH WHERE ",
+    " GPH.GROUP_ID='", PUBLIC_GROUP_ID, "' AND GPH.LOCALE_TAG <> '' AND ",
+    " NOT EXISTS (SELECT * FROM GROUP_PAGE_HISTORY GP WHERE ",
+    " GPH.PAGE_ID=GP.PAGE_ID AND ",
+    " GPH.PUBDATE < GP.PUBDATE) ORDER BY GPH.LOCALE_TAG, GPH.TITLE"
+]);
+$result = $db->execute($sql);
+$app_config_dir = APP_DIR . "/configs";
+// the export is written into the app's configs folder, so make sure it exists
+if (!file_exists($app_config_dir)) {
+    L\crawlLog("$app_config_dir does not exists, trying to make it...\n");
+    if (mkdir($app_config_dir) === false) {
+        L\crawlLog("Make $app_config_dir failed, quitting");
+        exit();
+    }
+}
+$out_file = "$app_config_dir/PublicHelpPages.php";
+$out = "<"."?php\n";
+$out .= <<< EOD
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Default Public Wiki Pages
+ *
+ * This file should be generated using ExportPublicHelpDb.php
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+EOD;
+$out .= "\nnamespace " . substr(NS_CONFIGS, 0, -1) . ";\n\n";
+$out .= <<< EOD
+/**
+ * Public wiki pages
+ * @var array
+ */
+
+EOD;
+$out .= "\$public_pages = [];\n";
+if ($result) {
+    /* Emit one assignment per row; the page text goes in a nowdoc ('EOD')
+       so nothing in it is interpolated when the generated file is loaded
+     */
+    while($row = $db->fetchArray($result)) {
+        $out .= sprintf("\$public_pages[\"%s\"][\"%s\"] = <<< 'EOD'\n",
+            $row['LOCALE_TAG'], $row['TITLE']);
+        $out .= sprintf("%s\nEOD;\n", $row['PAGE']);
+    }
+}
+$out .= "//\n// Default Help Wiki Pages\n//\n";
+/*
+   For the help group, select each page revision that has a locale and for
+   which no revision of the same page has a later PUBDATE
+ */
+$sql = "SELECT GPH.TITLE AS TITLE, GPH.PAGE AS PAGE, ".
+    " GPH.LOCALE_TAG AS LOCALE_TAG FROM GROUP_PAGE_HISTORY GPH WHERE ".
+    " GPH.GROUP_ID='".HELP_GROUP_ID."' AND GPH.LOCALE_TAG <> '' AND ".
+    " NOT EXISTS (SELECT * FROM GROUP_PAGE_HISTORY GP WHERE ".
+    " GPH.PAGE_ID=GP.PAGE_ID AND ".
+    " GPH.PUBDATE < GP.PUBDATE) ORDER BY GPH.LOCALE_TAG, GPH.TITLE";
+$result = $db->execute($sql);
+// the blank line before the terminator ends the doc comment with a newline,
+// so the assignment after it starts on its own line in the generated file
+$out .= <<< EOD
+/**
+ * Help wiki pages
+ * @var array
+ */
+
+EOD;
+$out .= '$help_pages = [];'."\n";
+if ($result) {
+    while($row = $db->fetchArray($result)) {
+        /* Use a nowdoc ('EOD'), as is done for the public pages, so that
+           dollar signs and backslashes in the wiki text are reproduced
+           verbatim when the generated file is loaded rather than being
+           interpolated as in a heredoc
+         */
+        $out .= '$help_pages["' . $row['LOCALE_TAG'] . '"]["' .
+            $row['TITLE'] . '"] = <<< '."'EOD'\n";
+        $out .= $row['PAGE'] ."\nEOD;\n";
+    }
+}
+$out .= "\n";
+// only report success if the export file was really written
+if (file_put_contents($out_file, $out) === false) {
+    L\crawlLog("Could not write export data to $out_file");
+    exit(1);
+}
+L\crawlLog("Wrote export data to $out_file");
+
--- a/src/configs/PublicHelpPages.php
+++ b/src/configs/PublicHelpPages.php
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@ -0,0 +1,292 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * TokenTool is used to create suggest word dictionaries and 'n' word gram
+ * filter files for the Yioop! search engine.
+ *
+ * A description of its usage is given in the $usage global variable
+ *
+ *
+ * @author Ravi Dhillon  ravi.dhillon@yahoo.com, Chris Pollett (modified for n
+ *     ngrams)
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+
+namespace seekquarry\yioop\configs;
+
+use seekquarry\yioop\library\NWordGrams;
+use seekquarry\yioop\library\Trie;
+
+// This tool may only be run from the command line, never via a web request
+if (php_sapi_name() != 'cli') {
+    echo "BAD REQUEST";
+    exit();
+}
+ini_set("memory_limit","1500M");
+/**
+ * Load in global configuration settings. The path is anchored to this
+ * script's folder so that the include_path or the current working directory
+ * cannot cause a different Config.php to be loaded
+ */
+require_once __DIR__ . "/Config.php";
+/**
+ * Used to print out a description of how to use TokenTool.php
+ * @var string
+ */
+$usage = <<<EOD
+TokenTool.php
+==============
+
+Usage
+=====
+TokenTool is used to create suggest word dictionaries,
+segment and 'n' word gram filter files for the Yioop! search engine.
+To create either of these items, the user
+puts a source file in Yioop's WORK_DIRECTORY/prepare folder. Suggest word
+dictionaries are used to supply the content of the dropdown of search terms
+that appears as a user is entering a query in Yioop. To make a suggest
+dictionary one can use a command like:
+
+php TokenTool.php dictionary filename locale endmarker
+
+Here filename should be in the current folder or PREP_DIR and should consist
+of one word per line, locale is the locale this suggest (for example, en-US)
+file is being made for and where a file suggest-trie.txt.gz will be written,
+and endmarker is the end of word symbol to use in the trie. For example,
+$ works pretty well.
+
+TokenTool.php can also be used to make filter files. A filter file is used to
+detect when words in a language should be treated as a unit when extracting text
+during a crawl and at search time.  For example, Bill Clinton is 2 word gram
+which should be treated as unit because it is a particular person. These
+filter files can also be used  with a segmenter which
+might be used to split Chinese or Japanese text which does not have spaces into
+a sequence of Chinese and Japanese words (which may be made out of multiple
+characters). For a nonsegmenter filter, TokenTool.php is run from the
+command line as:
+
+php TokenTool.php filter wiki_file lang locale n extract_type max_to_extract
+
+where file is a wikipedia xml file or is a bz2  compressed xml file whose urls
+or wiki page count dump file (it can also be a folder of these kind of files)
+used to determine the n-grams,
+lang is an Wikipedia language tag (ignored in segmenter case),
+locale is the IANA language tag of the locale to store the results for
+(if different from lang, for example, en-US versus en for lang), n is the
+number of words in a row to consider , extract_type is where from Wikipedia
+source to extract:
+
+0 = title's,
+1 = redirect's,
+2 = page count dump wikipedia data,
+3 = page count dump wiktionary data.
+
+For a segmenter filter, TokenTool.php is run from the
+command line as:
+
+php TokenTool.php segment-filter dictionary_file locale
+
+Here dictionary_file should be a text file with one word/line,
+locale is the IANA language tag of the locale to store the results for.
+
+
+Obtaining Data
+==============
+Many word lists are obtainable on the web for free with Creative Commons
+licenses. A good starting point is:
+http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
+A little script-fu can generally take such a list and put it into the
+format of one word/term per line which is needed by TokenTool.php
+
+For filter file, Raw page count dumps can be found at
+http://dumps.wikimedia.org/other/pagecounts-raw/
+These probably give the best n-gram or all gram results, usually
+in a matter of minutes; nevertheless, this tool does support trying to extract
+similar data from Wikipedia dumps. This can take hours.
+
+For Wikipedia dumps, one can go to http://dumps.wikimedia.org/enwiki/
+and obtain a dump of the English Wikipedia (similar for other languages).
+This page lists all the dumps according to date they were taken. Choose any
+suitable date or the latest. A link with a label such as 20120104/, represents
+a  dump taken on  01/04/2012.  Click this link to go in turn to a page which has
+many links based on type of content you are looking for. For
+this tool you are interested in files under
+
+"Recombine all pages, current versions only".
+
+Beneath this we might find a link with a name like:
+enwiki-20120104-pages-meta-current.xml.bz2
+which is a file that could be processed by this tool.
+
+A Creative Commons licensed file which can be manipulated into a dictionary
+file suitable for Chinese segmentation can be found at:
+http://www.mdbg.net/chindict/chindict.php?page=cc-cedict
+
+EOD;
+$num_args = count($argv);
+if ( $num_args < 3 || $num_args > 8) {
+    echo $usage;
+    exit();
+}
+/*
+   Check that the instance has been configured before dispatching on the
+   command, so the message below is shown instead of doing the work rather
+   than after the work has already been carried out
+ */
+if (!PROFILE) {
+    echo "Please configure the search engine instance ".
+        "by visiting its web interface on localhost.\n";
+    exit();
+}
+switch ($argv[1]) {
+    case "dictionary":
+        // locale and end marker are optional for the dictionary command
+        if (!isset($argv[3])) {
+            $argv[3] = "en-US";
+        }
+        if (!isset($argv[4])) {
+            $argv[4] = "$";
+        }
+        makeSuggestTrie($argv[2], $argv[3], $argv[4]);
+        break;
+    case "filter":
+        // drop the script name and the command, leaving the filter arguments
+        array_shift($argv);
+        array_shift($argv);
+        makeNWordGramsFiles($argv);
+        break;
+    case "segment-filter":
+        if (!isset($argv[3])) {
+            // the locale to store the segment filter for is required
+            echo $usage;
+            exit();
+        }
+        $file_path = PREP_DIR."/";
+        if (!file_exists($file_path.$argv[2])) {
+            echo $argv[2]." does not exist in ".$file_path;
+            exit();
+        }
+        NWordGrams::makeSegmentFilterFile($file_path.$argv[2], $argv[3]);
+        break;
+    default:
+        echo $usage;
+        exit();
+}
+/**
+ * Makes an n or all word gram Bloom filter based on the supplied arguments
+ * Wikipedia files are assumed to have been place in the PREP_DIR before this
+ * is run and writes it into the resources folder of the given locale
+ *
+ * Missing arguments are defaulted as follows: lang "en" (with locale
+ * "en-US"), locale equal to lang, n equal to 2, extract_type
+ * NWordGrams::PAGE_COUNT_WIKIPEDIA, and max_to_extract 400000 when all grams
+ * are requested from Wikipedia page count data and -1 otherwise (presumably
+ * meaning no limit; see NWordGrams::makeNWordGramsTextFile). Arguments that
+ * were supplied are passed through unchanged.
+ *
+ * @param array $args command line arguments with first two elements of $argv
+ *     removed. For details on which arguments do what see the $usage variable
+ */
+function makeNWordGramsFiles($args)
+{
+    if (!isset($args[1])) {
+        $args[1] = "en";
+        $args[2] = "en-US";
+    }
+    if (!isset($args[2])) {
+        $args[2] = $args[1];
+    }
+    if (!isset($args[3])) {
+        $args[3] = 2; // bigrams
+    }
+    /* $argv is not visible inside a function, so the extract type has to be
+       looked up in $args; otherwise a supplied value is always overwritten
+     */
+    if (!isset($args[4])) {
+        $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
+    }
+    // only choose a default maximum when the caller did not supply one
+    if (!isset($args[5])) {
+        $args[5] = ($args[3] === "all" &&
+            $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) ? 400000 : -1;
+    }
+    $wiki_file_path = PREP_DIR."/";
+    if (!file_exists($wiki_file_path.$args[0])) {
+        echo $args[0]." does not exist in $wiki_file_path";
+        exit();
+    }
+    /*
+     *This call creates a ngrams text file from input xml file and
+     *returns the count of ngrams in the text file.
+     */
+    list($num_ngrams, $max_gram_len) =
+        NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2],
+        $args[3], $args[4], $args[5]);
+
+    /*
+     *This call creates a bloom filter file from n word grams text file based
+     *on the language specified.The lang passed as parameter is prefixed
+     *to the filter file name. The count of n word grams in text file is passed
+     *as a parameter to set the limit of n word grams in the filter file.
+     */
+    NWordGrams::makeNWordGramsFilterFile($args[2], $args[3], $num_ngrams,
+        $max_gram_len);
+}
+
+/**
+ * Makes a trie that can be used to make word suggestions as someone enters
+ * terms into the Yioop! search box. Outputs the result into the file
+ * suggest_trie.txt.gz in the resources folder of the supplied locale's dir.
+ * If no words could be read from the dictionary file nothing is written,
+ * so an existing suggest file is left intact.
+ *
+ * @param string $dict_file where the word list is stored, one word per line
+ * @param string $locale which locale to write the suggest file to
+ * @param string $end_marker used to indicate end of word in the trie
+ */
+function makeSuggestTrie($dict_file, $locale, $end_marker)
+{
+    $locale = str_replace("-", "_", $locale);
+    $out_file = LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz";
+
+    // Read and load the dictionary file
+    $words = fileWithTrim($dict_file);
+    if (empty($words)) {
+        /* fileWithTrim has already reported a missing file; do not replace
+           an existing suggest file with an empty trie
+         */
+        echo "No words read from $dict_file, suggest file not written\n";
+        return;
+    }
+    sort($words);
+    $trie = new Trie($end_marker);
+
+    /** Ignore the words in the following cases. If the word
+     *  - begins with punctuation (mb_ereg_match only tests for a match at
+     *    the start of the string)
+     *  - is less than 3 characters
+     *  NOTE(review): stop words are not filtered out here and punctuation
+     *  after the first character is allowed -- confirm this is intended
+     */
+    foreach ($words as $word) {
+        if (mb_ereg_match("\p{P}", $word) == 0 && mb_strlen($word) > 2) {
+            $trie->add($word);
+        }
+    }
+    $output = [];
+    $output["trie_array"] = $trie->trie_array;
+    $output["end_marker"] = $trie->end_marker;
+    file_put_contents($out_file, gzencode(json_encode($output), 9));
+}
+
+/**
+ * Loads the lines of a file into an array, stripping leading and trailing
+ * whitespace from each line and discarding lines that are then empty. The
+ * name is first tried as given and then relative to PREP_DIR; if neither
+ * exists a not found message is printed and an empty array is returned.
+ *
+ * @param string $file_name file to read into array
+ * @return array of trimmed, non-blank lines
+ */
+function fileWithTrim($file_name)
+{
+    $path = $file_name;
+    if (!file_exists($path)) {
+        // fall back to looking for the file in the prepare folder
+        $path = PREP_DIR."/$file_name";
+        if (!file_exists($path)) {
+            echo "$path Not Found\n\n";
+            return [];
+        }
+    }
+    $trimmed_lines = [];
+    foreach (mb_split("\n", file_get_contents($path)) as $raw_line) {
+        $trimmed = preg_replace( "/(^\s+)|(\s+$)/us", "", $raw_line );
+        if ($trimmed == "") {
+            continue;
+        }
+        $trimmed_lines[] = $trimmed;
+    }
+    return $trimmed_lines;
+}
--- a/src/configs/default_crawl.ini
+++ b/src/configs/default_crawl.ini
@ -0,0 +1,180 @@
+; ***** BEGIN LICENSE BLOCK *****
+;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
+;  Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+;
+;  This program is free software: you can redistribute it and/or modify
+;  it under the terms of the GNU General Public License as published by
+;  the Free Software Foundation, either version 3 of the License, or
+;  (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;  ***** END LICENSE BLOCK *****
+;
+; default_crawl.ini
+;
+; This is an example of a crawl.ini configuration file. If you mess up
+; the crawl.ini you can simply delete it and this one will be used to recreate
+; it
+;
+[general]
+arc_dir = "";
+arc_type = "";
+crawl_order = 'ad';
+crawl_type = 'ax';
+page_range_request = '50000';
+page_recrawl_frequency = '-1';
+restrict_sites_by_url = false;
+summarizer_option = 'dl';
+max_description_len = '2000';
+
+[indexed_file_types]
+extensions[] = 'unknown';
+extensions[] = 'bmp';
+extensions[] = 'doc';
+extensions[] = 'docx';
+extensions[] = 'csv';
+extensions[] = 'tab';
+extensions[] = 'tsv';
+extensions[] = 'txt';
+extensions[] = 'epub';
+extensions[] = 'asp';
+extensions[] = 'aspx';
+extensions[] = 'cgi';
+extensions[] = 'cfm';
+extensions[] = 'cfml';
+extensions[] = 'do';
+extensions[] = 'htm';
+extensions[] = 'html';
+extensions[] = 'jsp';
+extensions[] = 'php';
+extensions[] = 'pl';
+extensions[] = 'py';
+extensions[] = 'shtml';
+extensions[] = 'gif';
+extensions[] = 'xml';
+extensions[] = 'java';
+extensions[] = 'jpg';
+extensions[] = 'jpeg';
+extensions[] = 'pdf';
+extensions[] = 'png';
+extensions[] = 'ppt';
+extensions[] = 'pptx';
+extensions[] = 'py';
+extensions[] = 'rss';
+extensions[] = 'rtf';
+extensions[] = 'svg';
+extensions[] = 'xlsx';
+extensions[] = 'xml';
+
+[allowed_sites]
+url[] = 'http://www.yahoo.com/';
+url[] = 'http://www.youtube.com/';
+url[] = 'http://www.google.com/';
+
+[disallowed_sites]
+url[] = 'domain:arxiv.org';
+url[] = 'domain:ask.com';
+url[] = 'domain:yelp.com';
+url[] = 'domain:clixsense.com';
+
+[seed_sites]
+url[] = 'http://www.ucanbuyart.com/';
+url[] = 'http://www.wikipedia.org/';
+url[] = 'http://www.dmoz.org/';
+url[] = 'http://www.yahoo.com/';
+url[] = 'http://www.google.com/';
+url[] = 'http://www.amazon.com/';
+url[] = 'http://www.bing.com/';
+url[] = 'http://www.facebook.com/';
+url[] = 'http://www.blogger.com/';
+url[] = 'http://www.myspace.com/';
+url[] = 'http://www.craigslist.org/';
+url[] = 'http://www.cnn.com/';
+url[] = 'http://www.about.com/';
+url[] = 'http://www.cnet.com/';
+url[] = 'http://www.adobe.com/';
+url[] = 'http://www.mozilla.com/';
+url[] = 'http://www.weather.com/';
+url[] = 'http://www.digg.com/';
+url[] = 'http://www.zynga.com/';
+url[] = 'http://www.download.com/';
+url[] = 'http://www.ebay.com/';
+url[] = 'http://eccc.hpi-web.de/';
+url[] = 'http://citeseerx.ist.psu.edu/';
+url[] = 'http://www.archive.org/';
+url[] = 'http://www.imdb.com/';
+url[] = 'http://www.zillow.com/';
+url[] = 'http://www.wolframalpha.com/';
+url[] = 'http://www.youtube.com/';
+url[] = 'http://www.sourceforge.net/';
+url[] = 'http://www.huffingtonpost.com/';
+url[] = 'http://www.wikimedia.org/';
+url[] = 'http://www.reference.com/';
+url[] = 'http://www.comcast.net/';
+url[] = 'http://www.dell.com/';
+url[] = 'http://www.metacafe.com/';
+url[] = 'http://www.foxnews.com/';
+url[] = 'http://www.hp.com/';
+url[] = 'http://www.stumbleupon.com';
+url[] = 'http://www.twitter.com/';
+url[] = 'http://www.wordpress.org/';
+url[] = 'http://www.bankofamerica.com/';
+url[] = 'http://www.xing.com/';
+url[] = 'http://www.microsoft.com/';
+url[] = 'http://www.mybrowserbar.com/';
+url[] = 'http://www.guardian.co.uk/';
+url[] = 'http://www.skyrock.com/';
+url[] = 'http://www.dailymail.co.uk/';
+url[] = 'http://www.ign.com/';
+url[] = 'http://www.mozilla.org/';
+url[] = 'http://www.vimeo.com/';
+url[] = 'http://www.wsj.com/';
+url[] = 'http://www.walmart.com/';
+url[] = 'http://www.reuters.com/';
+url[] = 'http://www.usps.com/';
+url[] = 'http://www.telegraph.co.uk/';
+url[] = 'http://www.babylon.com/';
+url[] = 'http://www.ups.com/';
+url[] = 'http://www.mapquest.com/';
+url[] = 'http://www.reddit.com/';
+url[] = 'http://www.theplanet.com/';
+url[] = 'http://bestbuy.com/';
+url[] = 'http://www.verizon.net/';
+url[] = 'http://www.onemanga.com/';
+url[] = 'http://www.latimes.com/';
+url[] = 'http://www.washingtonpost.com/';
+url[] = 'http://www.att.com/';
+url[] = 'http://www.w3schools.com/';
+url[] = 'http://www.fox.com/';
+url[] = 'http://www.ibm.com/';
+url[] = 'http://www.engadget.com/';
+url[] = 'http://www.usatoday.com/';
+url[] = 'http://www.chase.com/';
+url[] = 'http://www.wellsfargo.com/';
+url[] = 'http://www.nih.gov';
+url[] = 'http://www.irs.gov/';
+url[] = 'http://www.ftb.ca.gov/';
+url[] = 'http://www.monster.com/';
+url[] = 'http://www.timesonline.co.uk/';
+url[] = 'http://www.careerbuilder.com/';
+url[] = 'http://www.icq.com/';
+url[] = 'http://www.abcnews.go.com/';
+url[] = 'http://www.tmz.com/';
+url[] = 'http://www.fedex.com/';
+url[] = 'http://www.informer.com/';
+url[] = 'http://www.snopes.com/';
+url[] = 'http://www.urbandictionary.com/';
+url[] = 'http://www.slashdot.org/';
+url[] = 'http://www.php.net/';
+url[] = 'http://www.intuit.com/';
+url[] = 'http://www.thesun.co.uk/';
+
+[page_rules]
+
+[indexing_plugins]
--- a/src/controllers/AdminController.php
+++ b/src/controllers/AdminController.php
@ -0,0 +1,671 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\UrlParser;
+use seekquarry\yioop\library\PageRuleParser;
+use seekquarry\yioop\library\Classifiers\Classifier;
+use seekquarry\yioop\library\CrawlDaemon;
+
+/**
+ * Controller used to handle admin functionalities such as
+ * modify login and password, CREATE, UPDATE,DELETE operations
+ * for users, roles, locale, and crawls
+ *
+ * @author Chris Pollett
+ */
+class AdminController extends Controller implements CrawlConstants
+{
+    /**
+     * Says which activities (roughly methods invoke from the web) this
+     * controller will respond to (note: more activities will be loaded from
+     * components)
+     * @var array
+     */
+    public $activities = ["crawlStatus", "machineStatus", "signout"];
+    /**
+     * An array of activities which are periodically updated within other
+     * activities that they live. For example, within manage crawl,
+     * the current crawl status is updated every 20 or so seconds.
+     * @var array
+     */
+    public $status_activities = ["crawlStatus", "machineStatus"];
+    /**
+     * Associative array of $components activities for this controller
+     * Components are collections of activities (a little like traits) which
+     * can be reused.
+     *
+     * @var array
+     */
+    public static $component_activities = [
+        "accountaccess" =>
+            ["signin", "manageAccount", "manageUsers", "manageRoles"],
+        "crawl" => ["manageCrawls", "pageOptions", "searchSources",
+            "resultsEditor", "scrapers", "manageClassifiers"],
+        "social" => ["manageGroups", "groupFeeds", "mixCrawls", "wiki"],
+        "advertisement" => ["manageCredits", "manageAdvertisements"],
+        "system" => ["manageMachines", "manageLocales", "serverSettings",
+            "security", "appearance", "configure"]
+    ];
+    /**
+     * This is the main entry point for handling requests to administer the
+     * Yioop/SeekQuarry site
+     *
+     * ProcessRequest determines the type of request (signin , manageAccount,
+     * etc) is being made.  It then calls the appropriate method to handle the
+     * given activity. Finally, it draws the relevant admin screen
+     */
+    public function processRequest()
+    {
+        $data = [];
+        if (!C\PROFILE) {
+            return $this->configureRequest();
+        }
+        $view = "signin";
+        if (isset($_SESSION['USER_ID'])) {
+            $user = $_SESSION['USER_ID'];
+        } else {
+            $user = $_SERVER['REMOTE_ADDR'];
+        }
+        $data['SCRIPT'] = "";
+        $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
+        $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user);
+        if ($token_okay || isset($_REQUEST['u'])) {
+            if (isset($_SESSION['USER_ID']) && !isset($_REQUEST['u'])) {
+                $data = array_merge($data, $this->processSession());
+                if (!isset($data['REFRESH'])) {
+                    $view = "admin";
+                } else {
+                    $view = $data['REFRESH'];
+                }
+            } else if (!isset($_SESSION['REMOTE_ADDR'])
+                && !isset($_REQUEST['u'])) {
+                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
+                    tl('admin_controller_need_cookies')."</h1>');";
+                unset($_SESSION['USER_ID']);
+            } else if ($this->checkSignin()) {
+                if (!isset($_SESSION['AUTH_COUNT']) ||
+                    isset($_REQUEST['round_num']) &&
+                    $_REQUEST['round_num'] < $_SESSION['AUTH_COUNT']) {
+                    $_SESSION['AUTH_COUNT'] = 0;
+                }
+                if (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION) {
+                    $_SESSION['AUTH_COUNT']++;
+                    if ($_SESSION['AUTH_COUNT'] != C\FIAT_SHAMIR_ITERATIONS) {
+                        $_SESSION['SALT_VALUE'] = rand(0, 1);
+                        $salt_value = $_SESSION['SALT_VALUE'];
+                        if ($_SESSION['AUTH_COUNT'] ==
+                            C\FIAT_SHAMIR_ITERATIONS - 1) {
+                            $salt_value = "done".$salt_value;
+                        }
+                        e($salt_value);
+                        exit();
+                    }
+                } else {
+                    /*
+                        if not doing Fiat Shamir pretend have gone through all
+                        needed iterations
+                     */
+                    $_SESSION['AUTH_COUNT'] = C\FIAT_SHAMIR_ITERATIONS;
+                }
+                $_SESSION['USER_NAME'] = $_REQUEST['u'];
+                // successful login.
+                if ($_SESSION['AUTH_COUNT'] == C\FIAT_SHAMIR_ITERATIONS) {
+                    $_SESSION['AUTH_COUNT'] = 0;
+                    $user_id = $this->model("signin")->getUserId(
+                        $this->clean($_REQUEST['u'], "string"));
+                    $session = $this->model("user")->getUserSession($user_id);
+                    if (isset($_SESSION['LAST_ACTIVITY']) &&
+                        is_array($_SESSION['LAST_ACTIVITY'])) {
+                        $_REQUEST = array_merge($_REQUEST,
+                            $_SESSION['LAST_ACTIVITY']);
+                    }
+                    if (is_array($session)) {
+                        $_SESSION = $session;
+                    }
+                    $allowed_activities =
+                        $this->model("user")->getUserActivities($user_id);
+                    // now don't want to use remote address anymore
+                    if (!$allowed_activities) {
+                        unset($_SESSION['USER_ID']);
+                        unset($_REQUEST);
+                        $_REQUEST['c'] = "admin";
+                        return $this->redirectWithMessage(
+                            tl('admin_controller_account_not_active'));
+                    } else {
+                        $_SESSION['USER_ID'] = $user_id;
+                        $_REQUEST[C\CSRF_TOKEN] = $this->generateCSRFToken(
+                            $_SESSION['USER_ID']);
+                        $preserve_array = [];
+                        if (!empty($_REQUEST['preserve']) &&
+                            $_REQUEST['preserve'] == 'true') {
+                            $preserve_array = [
+                                'a','arg', 'filter', 'group_id',
+                                'just_group_id', 'visible_users', 'user_filter'
+                                ];
+                        }
+                        return $this->redirectWithMessage(
+                            tl('admin_controller_login_successful'),
+                            $preserve_array);
+                    }
+                }
+            } else {
+                $alt_message = false;
+                $_SESSION['AUTH_COUNT'] = 0;
+                if (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION
+                    && !isset($_SESSION['AUTH_FAILED'])) {
+                    if (isset($_REQUEST['round_num'])) {
+                        $_SESSION['SALT_VALUE'] = 1;
+                        $_SESSION['AUTH_FAILED'] = -1;
+                        e($_SESSION['AUTH_FAILED']);
+                        exit();
+                    } else {
+                        unset($_SESSION['USER_ID']);
+                        unset($_SESSION['AUTH_FAILED']);
+                        unset($_REQUEST);
+                        $_REQUEST['c'] = "admin";
+                        return $this->redirectWithMessage(
+                            tl('admin_controller_no_back_button'));
+                    }
+                }
+                if (!$alt_message) {
+                    unset($_SESSION['USER_ID']);
+                    unset($_SESSION['AUTH_FAILED']);
+                    $login_attempted = false;
+                    if (isset($_REQUEST['u'])) {
+                        $login_attempted = true;
+                    }
+                    unset($_REQUEST);
+                    $_REQUEST['c'] = "admin";
+                    if ($login_attempted) {
+                        return $this->redirectWithMessage(
+                            tl('admin_controller_login_failed'));
+                    }
+                }
+            }
+        } else if ($this->checkCSRFToken(C\CSRF_TOKEN, "config")) {
+            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
+                tl('admin_controller_login_to_config')."</h1>')";
+        } else if (isset($_REQUEST['a']) &&
+            in_array($_REQUEST['a'], $this->status_activities)) {
+            e("<p class='red'>".
+                tl('admin_controller_status_updates_stopped')."</p>");
+            exit();
+        }
+        if ($token_okay && isset($_SESSION["USER_ID"])) {
+            $data["ADMIN"] = true;
+        } else {
+            $data["ADMIN"] = false;
+        }
+        if ($view == 'signin') {
+            if (C\AUTHENTICATION_MODE == C\ZKP_AUTHENTICATION) {
+                $data['AUTH_ITERATION'] = C\FIAT_SHAMIR_ITERATIONS;
+                $data['FIAT_SHAMIR_MODULUS'] = C\FIAT_SHAMIR_MODULUS;
+                $_SESSION['SALT_VALUE'] = rand(0, 1);
+                $data['INCLUDE_SCRIPTS'] = ["zkp", "big_int", "sha1"];
+            } else {
+                 unset($_SESSION['SALT_VALUE']);
+            }
+            $data[C\CSRF_TOKEN] = $this->generateCSRFToken(
+                $_SERVER['REMOTE_ADDR']);
+            $data['SCRIPT'] .= "var u; if ((u = elt('username')) && u.focus) ".
+               "u.focus();";
+        }
+        $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
+        if (!isset($data["USERNAME"]) && isset($_SESSION['USER_ID'])) {
+            $signin_model = $this->model("signin");
+            $data['USERNAME'] = $signin_model->getUserName(
+                $_SESSION['USER_ID']);
+        }
+        $this->initializeAdFields($data, false);
+        $this->displayView($view, $data);
+    }
+    /**
+     * Entry point used when no profile/work directory has been set up yet.
+     * It bypasses the usual sign-in flow: the session activity is processed,
+     * a CSRF token generated from the string "config" is added to the view
+     * data, and the admin view is displayed.
+     *
+     * NOTE(review): the configure screen is meant to be shown only to
+     * users connecting from localhost; that restriction is not enforced in
+     * this method -- confirm where it is applied.
+     */
+    public function configureRequest()
+    {
+        $view_data = $this->processSession();
+        $config_token = $this->generateCSRFToken("config");
+        $view_data[C\CSRF_TOKEN] = $config_token;
+        $this->displayView("admin", $view_data);
+    }
+    /**
+     * Validates the credentials sent with the current request (presumably
+     * by the signin form) using whichever scheme C\AUTHENTICATION_MODE
+     * selects.
+     *
+     * In normal mode the request fields 'u' and 'p' are handed to the signin
+     * model. Otherwise the zero-knowledge fields 'u', 'x' and 'y' are checked
+     * together with the session's SALT_VALUE, provided the session is not
+     * already flagged AUTH_FAILED; in that mode an unsuccessful check also
+     * resets the session's AUTH_COUNT to 0.
+     *
+     * @return bool whether the submitted credentials were accepted
+     */
+    public function checkSignin()
+    {
+        if (C\AUTHENTICATION_MODE == C\NORMAL_AUTHENTICATION) {
+            if (!isset($_REQUEST['u']) || !isset($_REQUEST['p'])) {
+                return false;
+            }
+            return $this->model("signin")->checkValidSignin(
+                $this->clean($_REQUEST['u'], "string"),
+                $this->clean($_REQUEST['p'], "string"));
+        }
+        // zero-knowledge mode: every field must be present and the session
+        // must not already be marked as having failed
+        $can_attempt = isset($_REQUEST['u'], $_REQUEST['x'], $_REQUEST['y'],
+            $_SESSION['SALT_VALUE']) && !isset($_SESSION['AUTH_FAILED']);
+        $is_valid = false;
+        if ($can_attempt) {
+            $is_valid = $this->model("signin")->checkValidSigninForZKP(
+                $this->clean($_REQUEST['u'], "string"),
+                $this->clean($_REQUEST['x'], "string"),
+                $this->clean($_REQUEST['y'], "string"),
+                $_SESSION['SALT_VALUE'], C\FIAT_SHAMIR_MODULUS);
+        }
+        if (!$is_valid) {
+            // a failed round restarts the round counter
+            $_SESSION['AUTH_COUNT'] = 0;
+        }
+        return $is_valid;
+    }
+    /**
+     * Determines the user's current allowed activities and current activity,
+     * then calls the method for the latter.
+     *
+     * This is called from {@link processRequest()} once a user is logged in,
+     * and from {@link configureRequest()} when no profile exists.
+     *
+     * The requested activity comes from $_REQUEST['a'] when it is one of
+     * $this->activities; it is forced to "configure" when there is no profile
+     * (or FIX_NAME_SERVER is set) and otherwise defaults to "manageAccount".
+     *
+     * @return array $data the results of doing the activity for display in the
+     *     view. Contains only INACTIVE => true when the user has no allowed
+     *     activities; otherwise it always has COMPONENT_ACTIVITIES and, when
+     *     the activity was allowed, ACTIVITY_METHOD and ACTIVITIES
+     */
+    public function processSession()
+    {
+        $data = [];
+        $allowed = false;
+        if (!C\PROFILE || (C\nsdefined("FIX_NAME_SERVER") &&
+            C\FIX_NAME_SERVER)) {
+            $activity = "configure";
+        } else if (isset($_REQUEST['a']) &&
+            in_array($_REQUEST['a'], $this->activities)) {
+            $activity = $_REQUEST['a'];
+        } else {
+            $activity = "manageAccount";
+        }
+        $activity_model = $this->model("activity");
+        if (!C\PROFILE) {
+            // without a profile the only activity offered is the one chosen
+            // above, and it is always permitted
+            $allowed_activities = [ [
+                "ACTIVITY_NAME" =>
+                $activity_model->getActivityNameFromMethodName($activity),
+                'METHOD_NAME' => $activity]];
+            $allowed = true;
+        } else {
+            $allowed_activities =
+                 $this->model("user")->getUserActivities($_SESSION['USER_ID']);
+        }
+        // loose comparison on purpose: any empty result counts as inactive
+        if ($allowed_activities == []) {
+            $data['INACTIVE'] = true;
+            return $data;
+        }
+        /* status/wiki activities are permitted whenever the activity that
+           owns them is permitted
+         */
+        $implied_by = [
+            "crawlStatus" => "manageCrawls",
+            "machineStatus" => "manageMachines",
+            "wiki" => "groupFeeds"
+        ];
+        $granting_methods = [$activity];
+        if (isset($implied_by[$activity])) {
+            $granting_methods[] = $implied_by[$activity];
+        }
+        foreach ($allowed_activities as $allowed_activity) {
+            if (in_array($allowed_activity['METHOD_NAME'],
+                $granting_methods)) {
+                $allowed = true;
+                break;
+            }
+        }
+        // business role only allows managing advertisements;
+        if (!$allowed && $activity == "manageAccount") {
+            // fall back to the first activity the user is allowed to do
+            $activity = $allowed_activities[0]['METHOD_NAME'];
+            $_REQUEST["a"] = $activity;
+            $allowed = true;
+        }
+        if ($allowed) {
+            $data = $this->call($activity);
+            /* normalise before writing any keys, so a non-array result from
+               the activity cannot discard or break the fields set below
+             */
+            if (!is_array($data)) {
+                $data = [];
+            }
+            $data['ACTIVITY_METHOD'] = $activity; //for settings controller
+            $data['ACTIVITIES'] = $allowed_activities;
+        }
+        if (!in_array($activity, $this->status_activities)) {
+            $name_activity = $activity;
+            if ($activity == "wiki") {
+                // wiki pages are listed under the groupFeeds activity name
+                $name_activity = "groupFeeds";
+            }
+            $data['CURRENT_ACTIVITY'] =
+                $activity_model->getActivityNameFromMethodName($name_activity);
+            if (!empty($_REQUEST['TOGGLE_ACTIVITIES'])) {
+                // flip the flag and persist the whole session for the user
+                $_SESSION['HIDE_ACTIVITIES'] = (empty(
+                    $_SESSION['HIDE_ACTIVITIES'])) ? true : false;
+                $this->model("user")->setUserSession($_SESSION['USER_ID'],
+                    $_SESSION);
+            }
+            $data['HIDE_ACTIVITIES'] = empty($_SESSION['HIDE_ACTIVITIES']) ?
+                false : true;
+        }
+        $data['COMPONENT_ACTIVITIES'] = [];
+        $component_translations = [
+            "accountaccess" => tl('admin_controller_account_access'),
+            "social" => tl('admin_controller_social'),
+            "crawl" => tl('admin_controller_crawl_settings'),
+            "system" => tl('admin_controller_system_settings'),
+            "advertisement" => tl('admin_controller_advertisement')
+        ];
+        if (isset($data["ACTIVITIES"])) {
+            // group the allowed activities under their translated component
+            foreach (self::$component_activities as $component => $activities){
+                foreach ($data["ACTIVITIES"] as $listed_activity) {
+                    if (in_array($listed_activity['METHOD_NAME'],
+                        $activities)) {
+                        $data['COMPONENT_ACTIVITIES'][
+                            $component_translations[$component]][] =
+                            $listed_activity;
+                    }
+                }
+            }
+        }
+        return $data;
+    }
+    /**
+     * Used to handle crawlStatus REST activities requesting the status of the
+     * current web crawl
+     *
+     * Asks the crawl model for the combined crawl information of all queue
+     * server urls, sends a stop message if that information reports the crawl
+     * as stalled, removes the currently active crawl from the list of recent
+     * crawls, sorts the remaining ones and pages them.
+     *
+     * @return array $data contains crawl status of current crawl as well as
+     *     info about prior crawls and which crawl is being used for default
+     *     search results (CURRENT_INDEX, -1 if none)
+     */
+    public function crawlStatus()
+    {
+        $data = [];
+        $data['REFRESH'] = "crawlstatus";
+        $crawl_model = $this->model("crawl");
+        $crawl_time = $crawl_model->getCurrentIndexDatabaseName();
+        $data['CURRENT_INDEX'] = (isset($crawl_time)) ? (int)$crawl_time : -1;
+        $machine_urls = $this->model("machine")->getQueueServerUrls();
+        list($stalled, $status, $data['RECENT_CRAWLS']) =
+            $crawl_model->combinedCrawlInfo($machine_urls);
+        if ($stalled) {
+            $crawl_model->sendStopCrawlMessage($machine_urls);
+        }
+        $data = array_merge($data, $status);
+        $data["CRAWL_RUNNING"] = false;
+        if (!empty($data['CRAWL_TIME'])) {
+            //erase from previous crawl list any active crawl
+            $active_crawl_time = $data['CRAWL_TIME'];
+            /* array_filter keeps the original keys, so reindex afterwards;
+               otherwise removing entry 0 would make the isset(...[0]) test
+               below fail and leave the remaining crawls unsorted
+             */
+            $data['RECENT_CRAWLS'] = array_values(array_filter(
+                $data['RECENT_CRAWLS'],
+                function ($crawl) use ($active_crawl_time) {
+                    return !empty($crawl) &&
+                        $crawl['CRAWL_TIME'] != $active_crawl_time;
+                }));
+            $data["CRAWL_RUNNING"] = true;
+        }
+        if (isset($data['RECENT_CRAWLS'][0])) {
+            /* NOTE(review): this first call presumably tells rorderCallback
+               which field (CRAWL_TIME) to compare on during the usort below
+               -- confirm against the library implementation
+             */
+            L\rorderCallback($data['RECENT_CRAWLS'][0],
+                $data['RECENT_CRAWLS'][0], 'CRAWL_TIME');
+            usort($data['RECENT_CRAWLS'], C\NS_LIB . "rorderCallback");
+        }
+        $this->pagingLogic($data, 'RECENT_CRAWLS', 'RECENT_CRAWLS',
+            C\DEFAULT_ADMIN_PAGING_NUM);
+        return $data;
+    }
+    /**
+     * Collects the paged MACHINES data via the machine model and, when the
+     * media mode is "name_server", restarts the MediaUpdater daemon if it is
+     * flagged as turned on but its MediaUpdater status is 0.
+     *
+     * @return array $data MACHINES field has information about each
+     *     machine managed by this Yioop instance as well the on off
+     *     status of its queue_servers and fetchers. MEDIA_MODE is the
+     *     profile's media mode ("name_server" when the profile has none).
+     *     The REFRESH field is used to tell the controller that the view
+     *     shouldn't have its own sidemenu.
+     */
+    public function machineStatus()
+    {
+        $data = ['REFRESH' => "machinestatus"];
+        $machine_model = $this->model("machine");
+        $this->pagingLogic($data, $machine_model, 'MACHINES',
+            C\DEFAULT_ADMIN_PAGING_NUM);
+        $profile = $this->model("profile")->getProfile(C\WORK_DIRECTORY);
+        if (isset($profile['MEDIA_MODE'])) {
+            $data['MEDIA_MODE'] = $profile['MEDIA_MODE'];
+        } else {
+            $data['MEDIA_MODE'] = "name_server";
+        }
+        if ($data['MEDIA_MODE'] != "name_server") {
+            return $data;
+        }
+        if ($data['MACHINES']['NAME_SERVER']["MEDIA_UPDATER_TURNED_ON"] &&
+            $data['MACHINES']['NAME_SERVER']["MediaUpdater"] == 0) {
+            // try to restart news server if dead
+            CrawlDaemon::start("MediaUpdater", 'none', "", -1);
+        }
+        return $data;
+    }
+    /**
+     * Used to update the yioop installation profile based on $_REQUEST data
+     *
+     * For every field listed in the profile model's profile_fields: if the
+     * field was submitted, its (usually cleaned) value is copied into both
+     * $data and $profile; if it was not submitted and $data has no value for
+     * it yet, $data receives a default and check box fields are recorded as
+     * false in $profile.
+     *
+     * @param array& $data field data to be sent to the view
+     * @param array& $profile used to contain the current and updated profile
+     *     field values
+     * @param array $check_box_fields fields whose data comes from a html
+     *     checkbox
+     */
+    public function updateProfileFields(&$data, &$profile,
+        $check_box_fields = [])
+    {
+        $script_array = ['SIDE_ADSCRIPT', 'TOP_ADSCRIPT', 'GLOBAL_ADSCRIPT'];
+        foreach ($script_array as $value) {
+            if (isset($_REQUEST[$value])) {
+                // parentheses in ad scripts are stored as HTML entities
+                $_REQUEST[$value] = str_replace(["(", ")"],
+                    ["&#40;", "&#41;"], $_REQUEST[$value]);
+            }
+        }
+        $color_fields = ['BACKGROUND_COLOR', 'FOREGROUND_COLOR',
+            'SIDEBAR_COLOR', 'TOPBAR_COLOR'];
+        // fields whose submitted value is taken as is, without cleaning
+        $raw_fields = ["ROBOT_DESCRIPTION", "MEMCACHE_SERVERS",
+            "PROXY_SERVERS"];
+        // fields holding whitespace separated lists of servers
+        $server_list_fields = ["MEMCACHE_SERVERS", "PROXY_SERVERS"];
+        foreach ($this->model("profile")->profile_fields as $field) {
+            if (isset($_REQUEST[$field])) {
+                if (in_array($field, $raw_fields, true)) {
+                    $clean_value = $_REQUEST[$field];
+                } else if (in_array($field, $color_fields)) {
+                    $clean_value = $this->clean($_REQUEST[$field], "color");
+                } else {
+                    $clean_value = $this->clean($_REQUEST[$field], "string");
+                }
+                /* substr is safe on an empty string, whereas indexing
+                   $clean_value[strlen($clean_value) - 1] reads an
+                   uninitialized offset when nothing was entered
+                 */
+                if ($field == "NAME_SERVER" &&
+                    substr($clean_value, -1) != "/") {
+                    $clean_value .= "/";
+                }
+                $data[$field] = $clean_value;
+                $profile[$field] = $data[$field];
+                if (in_array($field, $server_list_fields, true)) {
+                    // the profile stores server lists in "|Z|" joined form
+                    $mem_array = preg_split("/(\s)+/", $clean_value);
+                    $profile[$field] =
+                        $this->convertArrayLines(
+                            $mem_array, "|Z|", true);
+                }
+            }
+            if (!isset($data[$field])) {
+                /* NOTE(review): defined()/constant() look the name up in the
+                   global namespace; if profile constants live in the configs
+                   namespace this default is never found -- confirm
+                 */
+                if (defined($field) && !in_array($field, $check_box_fields)) {
+                    $data[$field] = constant($field);
+                } else {
+                    $data[$field] = "";
+                }
+                if (in_array($field, $check_box_fields)) {
+                    // an unsubmitted check box means it was unchecked
+                    $profile[$field] = false;
+                }
+            }
+        }
+    }
+    /**
+     * Prepares the view data for a table search form and records the search
+     * in the session. If the form was submitted, the submitted comparison
+     * operators, sort orders and field values are read from $_REQUEST so
+     * the form's drop downs and inputs can be preserved. Table search forms
+     * are used by manageUsers, manageRoles, manageGroups, to do advanced
+     * search of the entity they are responsible for.
+     *
+     * Everything computed here is also stored under
+     * $_SESSION['SEARCH'][$activity . $field_postfix].
+     *
+     * @param array& $data modified to contain the field data needed for
+     *     the view to draw the search form
+     * @param string $activity activity in which this search is being
+     *     conducted
+     * @param array $comparison_fields those fields of the entity
+     *     in question ( for example, users) which can be searched and sorted
+     * @param array $equal_comparison_fields those of the comparison fields
+     *     which can only be searched by equality/inequality operators
+     * @param string $field_postfix suffix to append onto field names in
+     *     case there are multiple forms on the same page
+     * @return array one [field, comparison, value, sort] quadruple per
+     *     comparison field
+     */
+    public function tableSearchRequestHandler(&$data, $activity,
+        $comparison_fields = [], $equal_comparison_fields = [],
+        $field_postfix = "")
+    {
+        $data['FORM_TYPE'] = "search";
+        $session_key = $activity . $field_postfix;
+        $data['COMPARISON_TYPES'] = [
+            "=" => tl('admin_controller_equal'),
+            "!=" => tl('admin_controller_not_equal'),
+            "CONTAINS" => tl('admin_controller_contains'),
+            "BEGINS WITH" => tl('admin_controller_begins_with'),
+            "ENDS WITH" => tl('admin_controller_ends_with'),
+        ];
+        $data['EQUAL_COMPARISON_TYPES'] = [
+            "=" => tl('admin_controller_equal'),
+            "!=" => tl('admin_controller_not_equal'),
+        ];
+        $data['SORT_TYPES'] = [
+            "NONE" => tl('admin_controller_no_sort'),
+            "ASC" => tl('admin_controller_sort_ascending'),
+            "DESC" => tl('admin_controller_sort_descending'),
+        ];
+        foreach (['COMPARISON_TYPES', 'EQUAL_COMPARISON_TYPES',
+            'SORT_TYPES'] as $types_key) {
+            $_SESSION['SEARCH'][$session_key][$types_key] =
+                $data[$types_key];
+        }
+        $paging_parts = [];
+        // pass 1: comparison operator chosen for each field ("=" if none)
+        foreach ($comparison_fields as $name) {
+            $comparison_key = $name . "_comparison";
+            $types_key = 'COMPARISON_TYPES';
+            if (in_array($name, $equal_comparison_fields)) {
+                $types_key = 'EQUAL_COMPARISON_TYPES';
+            }
+            $chosen = "=";
+            if (isset($_REQUEST[$comparison_key]) &&
+                isset($data[$types_key][$_REQUEST[$comparison_key]])) {
+                $chosen = $_REQUEST[$comparison_key];
+            }
+            $data[$comparison_key] = $chosen;
+            $_SESSION['SEARCH'][$session_key]['COMPARISON_FIELDS'][
+                $comparison_key] = $chosen;
+            $paging_parts[] = "&amp;" . $comparison_key . "=" .
+                urlencode($chosen);
+        }
+        // pass 2: sort order chosen for each field ("NONE" if none)
+        foreach ($comparison_fields as $name) {
+            $sort_key = $name . "_sort";
+            $chosen = "NONE";
+            if (isset($_REQUEST[$sort_key]) &&
+                isset($data['SORT_TYPES'][$_REQUEST[$sort_key]])) {
+                $chosen = $_REQUEST[$sort_key];
+            }
+            $data[$sort_key] = $chosen;
+            $_SESSION['SEARCH'][$session_key]['SORT'][$sort_key] = $chosen;
+            $paging_parts[] = "&amp;" . $sort_key . "=" . urlencode($chosen);
+        }
+        // pass 3: submitted value of each field and the search quadruples
+        $search_array = [];
+        foreach ($comparison_fields as $name) {
+            $value_key = $name . $field_postfix;
+            $submitted = "";
+            if (isset($_REQUEST[$value_key])) {
+                $submitted = $this->clean($_REQUEST[$value_key], "string");
+            }
+            $data[$value_key] = $submitted;
+            $_SESSION['SEARCH'][$session_key]['FIELD_NAMES'][$value_key] =
+                $submitted;
+            $operator = $data[$name . "_comparison"];
+            $order = $data[$name . "_sort"];
+            // access values of 10 or more are searched as status = value/10
+            if ($value_key == 'access' && $data[$value_key] >= 10) {
+                $search_array[] = ["status", $operator,
+                    $data[$value_key]/10, $order];
+            } else {
+                $search_array[] = [$name, $operator, $data[$value_key],
+                    $order];
+            }
+            $paging_parts[] = "&amp;" . $value_key . "=" .
+                urlencode($data[$value_key]);
+        }
+        $data['PAGING'] = implode("", $paging_parts);
+        $_SESSION['SEARCH'][$session_key]['SEARCH_ARRAY'] = $search_array;
+        $_SESSION['SEARCH'][$session_key]['PAGING'] = $data['PAGING'];
+        return $search_array;
+    }
+    /**
+     * For activity involving items for which one can do search (user, group,
+     * roles) this method is used to marshal the last search that was performed
+     * out of the session when one navigates back to search
+     *
+     * The search is read from
+     * $_SESSION['LAST_SEARCH'][$activity . $field_postfix]; if nothing is
+     * stored there $data is left untouched.
+     *
+     * @param array &$data field variables used by view to draw itself
+     * @param string $activity current activity marshalling last search for
+     * @param string $field_postfix some activities support multiple search
+     *   forms. The field postfix is used to select among these.
+     * @return array|null the SEARCH_ARRAY of the last search (empty array
+     *   if it had none), or null when no last search is stored
+     */
+    function restoreLastSearchFromSession(&$data, $activity,
+        $field_postfix = "")
+    {
+        $activity_postfix = $activity . $field_postfix;
+        if (empty($_SESSION['LAST_SEARCH'][$activity_postfix])) {
+            return;
+        }
+        $last_search = $_SESSION['LAST_SEARCH'][$activity_postfix];
+        foreach (['COMPARISON_TYPES', 'EQUAL_COMPARISON_TYPES',
+            'SORT_TYPES', 'SEARCH_ARRAY'] as $field) {
+            $data[$field] = (empty($last_search[$field])) ? [] :
+                $last_search[$field];
+        }
+        // PAGING is a query string fragment, so its empty value is a string
+        $data['PAGING'] = (empty($last_search['PAGING'])) ? "" :
+            $last_search['PAGING'];
+        foreach (['COMPARISON_FIELDS', 'SORT', 'FIELD_NAMES'] as $field) {
+            // a stored search need not have every section
+            if (empty($last_search[$field]) ||
+                !is_array($last_search[$field])) {
+                continue;
+            }
+            foreach ($last_search[$field] as $name => $value) {
+                $data[$name] = $value;
+            }
+        }
+        return $data['SEARCH_ARRAY'];
+    }
+}
--- a/src/controllers/ApiController.php
+++ b/src/controllers/ApiController.php
@ -0,0 +1,115 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ *  @author Eswara Rajesh Pinapala epinapala@live.com
+ *  @license http://www.gnu.org/licenses/ GPL3
+ *  @link http://www.seekquarry.com/
+ *  @copyright 2009 - 2017
+ *  @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\WikiPaser;
+
+/**
+ * Controller which runs a single requested activity (by default
+ * groupFeeds) and renders the result with the 'api' view unless the
+ * activity asks for a different view. It can be reached both by signed in
+ * users and by visitors without a session user.
+ *
+ * @author Eswara Rajesh Pinapala
+ */
+class ApiController extends Controller implements CrawlConstants
+{
+    /**
+     * Associative array of $components activities for this controller
+     * Components are collections of activities (a little like traits) which
+     * can be reused.
+     *
+     * @var array
+     */
+    public static $component_activities = [  "social" => ["wiki"] ];
+    /**
+     * Main entry point of this controller. Hands off to configureRequest()
+     * when there is no profile; otherwise runs the requested activity via
+     * {@link processSession()} and displays the resulting view (the
+     * activity's VIEW if it set one, 'api' otherwise).
+     */
+    public function processRequest()
+    {
+        $data = [];
+        if (!C\PROFILE) {
+            return $this->configureRequest();
+        }
+        if (isset($_SESSION['USER_ID'])) {
+            $user_id = $_SESSION['USER_ID'];
+            $data['ADMIN'] = 1;
+        } else {
+            // visitors without an account are identified by their address
+            $user_id = $_SERVER['REMOTE_ADDR'];
+        }
+        $data['SCRIPT'] = "";
+        /* NOTE(review): the result of the token check is not acted upon
+           here; the call is kept in case it has side effects -- confirm
+           whether invalid tokens should restrict the request
+         */
+        $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
+
+        $data = array_merge($data, $this->processSession());
+
+        if (isset($data["VIEW"])) {
+            $view = $data["VIEW"];
+        } else {
+            $view = 'api';
+        }
+        $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
+        $this->displayView($view, $data);
+    }
+    /**
+     * Used to perform the actual activity call to be done by the
+     * api_controller.
+     * processSession is called from @see processRequest. It makes sure the
+     * $_REQUEST'd activity is one of this controller's activities (or falls
+     * back to groupFeeds) and then calls it. The activity's result is
+     * extended with ACTIVITY_CONTROLLER, PAGE_TITLE (the cleaned page_name
+     * request field) and ACTIVITY_METHOD.
+     *
+     * @return array the view data produced by the activity plus the fields
+     *     described above
+     */
+    public function processSession()
+    {
+        if (isset($_REQUEST['a']) &&
+                in_array($_REQUEST['a'], $this->activities)) {
+            $activity = $this->clean($_REQUEST['a'],"string");
+        } else {
+            $activity = "groupFeeds";
+        }
+        $data = $this->call($activity);
+        /* normalise before writing any keys, so a non-array result from
+           the activity cannot discard or break the fields set below
+         */
+        if (!is_array($data)) {
+            $data = [];
+        }
+        // page_name is optional; treat a missing one as the empty string
+        $page_name = (isset($_REQUEST['page_name'])) ?
+            $_REQUEST['page_name'] : "";
+        $data['ACTIVITY_CONTROLLER'] = "group";
+        $data['PAGE_TITLE'] = $this->clean($page_name, "string");
+        $data['ACTIVITY_METHOD'] = $activity; //for settings controller
+        return $data;
+    }
+}
+
--- a/src/controllers/ArchiveController.php
+++ b/src/controllers/ArchiveController.php
@ -0,0 +1,92 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\WebArchiveBundle;
+
+/**
+ * Fetcher machines also act as archives for complete caches of web pages,
+ * this controller is used to handle access to these web page caches
+ *
+ * @author Chris Pollett
+ */
+class ArchiveController extends Controller implements CrawlConstants
+{
+    /**
+     * The only legal activity this controller will accept is a request
+     * for the cache of a web page
+     * @var array
+     */
+    public $activities = ["cache"];
+
+    /**
+     * Main method for this controller to handle requests. It first checks
+     * the request is valid, and then handles the corresponding activity
+     *
+     * For this controller the only activity is to handle a cache request;
+     * requests naming any activity not listed in $activities are ignored.
+     */
+    public function processRequest()
+    {
+        /* do a quick test to see if this is a request seems like from a
+           legitimate machine
+         */
+        if (!$this->checkRequest()) {
+            return;
+        }
+        // never dispatch to a method that is not a declared activity
+        if (!isset($_REQUEST['a']) ||
+            !in_array($_REQUEST['a'], $this->activities, true)) {
+            return;
+        }
+        $activity = $this->clean($_REQUEST['a'], "string");
+        $this->call($activity);
+    }
+    /**
+     * Retrieves the requested page from the WebArchiveBundle and echoes it
+     * serialized and base64 encoded. If a required request field (offset,
+     * partition, crawl_time) is missing, crawl_time is not a string of
+     * digits, or the archive does not exist, the serialized, base64 encoded
+     * value false is echoed instead.
+     */
+    public function cache()
+    {
+        $not_found = base64_encode(serialize(false));
+        foreach (['offset', 'partition', 'crawl_time'] as $required) {
+            if (!isset($_REQUEST[$required])) {
+                echo $not_found;
+                return;
+            }
+        }
+        $offset = $this->clean($_REQUEST['offset'], "int");
+        $partition = $this->clean($_REQUEST['partition'], "int");
+        $crawl_time = $this->clean($_REQUEST['crawl_time'], "string");
+        /* crawl_time becomes part of a filesystem path, so anything other
+           than digits is rejected to prevent directory traversal.
+           NOTE(review): assumes crawl times are always decimal timestamps
+           -- confirm
+         */
+        if (!is_string($crawl_time) ||
+            !preg_match('/\A[0-9]+\z/', $crawl_time)) {
+            echo $not_found;
+            return;
+        }
+        $prefix = "";
+        if (isset($_REQUEST['instance_num'])) {
+            $prefix = $this->clean($_REQUEST['instance_num'], "int")."-";
+        }
+        $archive_path = C\CRAWL_DIR.'/cache/'.$prefix.
+            self::archive_base_name.$crawl_time;
+        if (file_exists($archive_path)) {
+            $web_archive = new WebArchiveBundle($archive_path);
+            $page = $web_archive->getPage($offset, $partition);
+            echo base64_encode(serialize($page));
+        } else {
+            echo $not_found;
+        }
+    }
+}
--- a/src/controllers/ClassifierController.php
+++ b/src/controllers/ClassifierController.php
@ -0,0 +1,315 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\UrlParser;
+use seekquarry\yioop\library\classifiers\Classifier;
+use seekquarry\yioop\library\archive_bundle_iterators\MixArchiveBundleIterator;
+
+/**
+ * This class handles XmlHttpRequests to label documents during classifier
+ * construction.
+ *
+ * Searching for new documents to label and add to the training set is a
+ * heavily-interactive operation, so it is implemented using asynchronous
+ * requests to this controller in order to fetch candidates for labeling and
+ * add labels without reloading the classifier edit page. The admin controller
+ * takes care of first displaying the "edit classifier" page, and handles
+ * requests to change a classifier's class label, but this controller handles
+ * the other asynchronous requests issued by the JavaScript on the page.
+ *
+ * @author Shawn Tice
+ */
+class ClassifierController extends Controller implements CrawlConstants
+{
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    public $activities = ["classify"];
+    /**
+     * Checks that the request seems to be coming from a legitimate, logged-in
+     * user, then dispatches to the appropriate activity. Requests naming an
+     * activity that is not listed in $activities are ignored.
+     */
+    public function processRequest()
+    {
+        if (!isset($_REQUEST['a']) || !$this->checkRequest()) {
+            return;
+        }
+        $activity = $_REQUEST['a'];
+        // strict comparison so non-string input can never match an activity
+        if (in_array($activity, $this->activities, true)) {
+            $this->call($activity);
+        }
+    }
+    /**
+     * Finds the next document for which to request a label, sometimes first
+     * recording the label that the user selected for the last document. This
+     * method should only be called via an XmlHttpRequest initiated by the edit
+     * classifier JavaScript, and consequently it always writes out
+     * JSON-encoded data, which is easily decoded by the page JavaScript.
+     *
+     * Request fields read: arg (getdocs, addlabel, or updateaccuracy),
+     * label, and optionally index, type, keywords, and doc_to_label.
+     */
+    public function classify()
+    {
+        $arg = isset($_REQUEST['arg']) ?
+            $this->clean($_REQUEST['arg'], 'string') : "";
+        $label = isset($_REQUEST['label']) ?
+            $this->clean($_REQUEST['label'], 'string') : "";
+
+        /*
+           Defaults for requests that do not carry an 'index' field; the
+           'addlabel' branch below still reads $index and $keywords.
+         */
+        $index = "";
+        $source_type = "";
+        $keywords = "";
+        if (isset($_REQUEST['index'])) {
+            $index = $this->clean($_REQUEST['index'], 'int');
+            if (intval($index) == 1) {
+                $index = $this->model("crawl")->getCurrentIndexDatabaseName();
+            }
+            if (isset($_REQUEST['type'])) {
+                $source_type = $this->clean($_REQUEST['type'], 'string');
+            }
+            if (isset($_REQUEST['keywords'])) {
+                $keywords = $this->clean($_REQUEST['keywords'], 'string');
+            }
+        }
+        /*
+           The call to prepareToLabel is important; it loads all of the data
+           required to manage the training set from disk, and also determines
+           what will be saved *back* to disk later.
+         */
+        $classifier = Classifier::getClassifier($label);
+        $classifier->prepareToLabel();
+        $data = [];
+        switch ($arg) {
+            case 'getdocs':
+                /*
+                   Load documents in from a user-specified index, and find the
+                   next best one to label (for 'manual' source type), or label
+                   them all with a single label (for either the 'positive' or
+                   'negative' source types).
+                 */
+                $mix_iterator = $this->buildClassifierCrawlMix(
+                    $label, $index, $keywords);
+                if ($source_type == 'manual') {
+                    $num_docs = $classifier->initBuffer($mix_iterator);
+                    $classifier->computeBufferDensities();
+                    $data['num_docs'] = $num_docs;
+                    list($new_doc, $disagreement) =
+                        $classifier->findNextDocumentToLabel();
+                    if ($new_doc) {
+                        $score = $classifier->classify($new_doc);
+                        $data['new_doc'] = $this->prepareUnlabelledDocument(
+                            $new_doc, $score, $disagreement,
+                            $index, $keywords);
+                    }
+                    Classifier::setClassifier($classifier);
+                } else if ($source_type == 'positive' ||
+                    $source_type == 'negative') {
+                    $doc_label = ($source_type == 'positive') ? 1 : -1;
+                    $add_count = $classifier->addAllDocuments(
+                        $mix_iterator, $doc_label);
+                    if ($add_count > 0) {
+                        /*
+                           Pass true to always update accuracy after adding a
+                           batch of documents all at once.
+                         */
+                        $classifier->train(true);
+                        Classifier::setClassifier($classifier);
+                    }
+                    $data['add_count'] = $add_count;
+                }
+                break;
+            case 'addlabel':
+                /*
+                   First label the last candidate document presented to the
+                   user (potentially skipping it instead of actually applying a
+                   label), then pick the next best candidate for labeling.
+                   When skipping a document instead of adding a label, avoid
+                   re-training since the training set hasn't actually changed.
+                 */
+                $doc = (isset($_REQUEST['doc_to_label']) &&
+                    is_array($_REQUEST['doc_to_label'])) ?
+                    $_REQUEST['doc_to_label'] : [];
+                $raw_key = isset($doc['key']) ? $doc['key'] : "";
+                $raw_label = isset($doc['label']) ? $doc['label'] : 0;
+                $key = L\webdecode($this->clean($raw_key, 'string'));
+                $doc_label = $this->clean($raw_label, 'int');
+                $mix_iterator = $this->retrieveClassifierCrawlMix($label);
+                $labels_changed = $classifier->labelDocument($key, $doc_label);
+                $num_docs = $classifier->refreshBuffer($mix_iterator);
+                $classifier->computeBufferDensities();
+                $data['num_docs'] = $num_docs;
+                if ($labels_changed) {
+                    // only recompute accuracy on every tenth labelled doc
+                    $update_accuracy = $classifier->total > 0 &&
+                        $classifier->total % 10 == 0;
+                    $classifier->train($update_accuracy);
+                }
+                list($new_doc, $disagreement) =
+                    $classifier->findNextDocumentToLabel();
+                if ($new_doc) {
+                    $score = $classifier->classify($new_doc);
+                    $data['new_doc'] = $this->prepareUnlabelledDocument(
+                        $new_doc, $score, $disagreement,
+                        $index, $keywords);
+                }
+                Classifier::setClassifier($classifier);
+                break;
+            case 'updateaccuracy':
+                /*
+                   Don't do anything other than re-compute the accuracy for the
+                   current training set.
+                 */
+                $classifier->updateAccuracy();
+                Classifier::setClassifier($classifier);
+                break;
+        }
+
+        /*
+           No matter which activity we ended up carrying out, always include
+           the statistics that *might* have changed so that the client can just
+           naively keep them up to date.
+         */
+        $data['positive'] = $classifier->positive;
+        $data['negative'] = $classifier->negative;
+        $data['total'] = $classifier->total;
+        $data['accuracy'] = $classifier->accuracy;
+
+        /*
+           Pass along a new authentication token so that the client can make a
+           new authenticated request after this one.
+         */
+        $data['authTime'] = strval(time());
+        $data['authSession'] = md5($data['authTime'] . C\AUTH_KEY);
+
+        $response = json_encode($data);
+        header("Content-Type: application/json");
+        header("Content-Length: ".strlen($response));
+        echo $response;
+    }
+    /* HELPER METHODS (declared public, but not listed as activities) */
+    /**
+     * Creates a new crawl mix for an existing index, with an optional query,
+     * and returns an iterator for the mix. The crawl mix name is derived from
+     * the class label, so that it can be easily retrieved and deleted later
+     * on. Any existing crawl mix with that name is deleted first.
+     *
+     * @param string $label class label of the classifier the new crawl mix
+     * will be associated with
+     * @param int $crawl_time timestamp of the index to be iterated over
+     * @param string $keywords an optional query used to restrict the pages
+     * retrieved by the crawl mix
+     * @return object A MixArchiveBundleIterator instance that will iterate
+     * over the pages of the requested index
+     */
+    public function buildClassifierCrawlMix($label, $crawl_time, $keywords)
+    {
+        $crawl_model = $this->model("crawl");
+        $mix_time = time();
+        $mix_name = Classifier::getCrawlMixName($label);
+
+        // Replace any existing crawl mix.
+        $old_time = $crawl_model->getCrawlMixTimestamp($mix_name);
+        if ($old_time) {
+            $crawl_model->deleteCrawlMixIteratorState($old_time);
+            $crawl_model->deleteCrawlMix($old_time);
+        }
+
+        $crawl_model->setCrawlMix([
+            'TIMESTAMP' => $mix_time,
+            'NAME' => $mix_name,
+            'OWNER_ID' => $_SESSION['USER_ID'],
+            'PARENT' => -1,
+            'FRAGMENTS' => [
+                ['RESULT_BOUND' => 1,
+                 'COMPONENTS' => [[
+                    'CRAWL_TIMESTAMP' => $crawl_time,
+                    'WEIGHT' => 1.0,
+                    'KEYWORDS' => $keywords]]]]]);
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+    /**
+     * Retrieves an iterator for an existing crawl mix. The crawl mix remembers
+     * its previous offset, so that the new iterator picks up where the
+     * previous one left off.
+     *
+     * @param string $label class label of the classifier this crawl mix is
+     * associated with
+     * @return object new MixArchiveBundleIterator instance that picks up where
+     * the previous one left off
+     */
+    public function retrieveClassifierCrawlMix($label)
+    {
+        $mix_name = Classifier::getCrawlMixName($label);
+        $mix_time = $this->model("crawl")->getCrawlMixTimestamp($mix_name);
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+    /**
+     * Creates a fresh array from an existing page summary array, and augments
+     * it with extra data relevant to the labeling interface on the client.
+     *
+     * @param array $page original page summary array
+     * @param float $score classification score (estimated by the Naive Bayes
+     * text classification algorithm) for $page
+     * @param float $disagreement disagreement score computed for $page
+     * @param int $crawl_time index the page came from
+     * @param string $keywords query supplied to the crawl mix used to find
+     * $page
+     * @return array reduced page summary structure containing only the
+     * information that the client needs to display a summary of the page
+     */
+    public function prepareUnlabelledDocument($page, $score, $disagreement,
+        $crawl_time, $keywords)
+    {
+        $phrase_model = $this->model("phrase");
+        // Highlight the query keywords, if any.
+        $disjunct_phrases = explode("|", $keywords);
+        $words = [];
+        foreach ($disjunct_phrases as $disjunct_phrase) {
+            list($word_struct, $format_words) =
+                $phrase_model->parseWordStructConjunctiveQuery(
+                    $disjunct_phrase);
+            $words = array_merge($words, $format_words);
+        }
+        $title = $phrase_model->boldKeywords(
+            $page[self::TITLE], $words);
+        $description = $phrase_model->getSnippets(
+            strip_tags($page[self::DESCRIPTION]), $words, 400);
+        $description = $phrase_model->boldKeywords(
+            $description, $words);
+        $cache_link = "?c=search&amp;a=cache".
+            "&amp;q=".urlencode($keywords).
+            "&amp;arg=".urlencode($page[self::URL]).
+            "&amp;its=".$crawl_time;
+        /*
+           Note that the confidence is a transformation of the score that
+           converts it into a value between 0 and 1, where it's 0 if the score
+           was exactly 0.5, and increases toward 1 as the score either
+           increases toward 1 or decreases toward 0.
+         */
+        return [
+            'title' => $title,
+            'url' => $page[self::URL],
+            'key' => L\webencode(Classifier::makeKey($page)),
+            'cache_link' => $cache_link,
+            'description' => $description,
+            'score' => $score,
+            'positive' => $score >= 0.5 ? 1 :0,
+            'confidence' => abs($score - 0.5) / 0.5,
+            'disagreement' => $disagreement];
+    }
+}
--- a/src/controllers/Controller.php
+++ b/src/controllers/Controller.php
@ -0,0 +1,964 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop as B;
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\AnalyticsManager;
+use seekquarry\yioop\library\UrlParser;
+use seekquarry\yioop\models\Model;
+use seekquarry\yioop\controllers\components\Component;
+use seekquarry\yioop\views\View;
+
+/**
+ * Load crawlHash  and timing functions
+ */
+require_once C\BASE_DIR."/library/Utility.php";
+/** For guessing locale and formatting date based on locale guessed*/
+require_once C\BASE_DIR."/library/LocaleFunctions.php";
+/**
+ * Translate the supplied arguments into the current locale.
+ *
+ * This function is a convenience copy of the same function
+ * @see seekquarry\yioop\library\tl() to this subnamespace
+ *
+ * @param string string_identifier  identifier to be translated
+ * @param mixed additional_args  used for interpolation in translated string
+ * @return string  translated string
+ */
+function tl()
+{
+    // forward every argument unchanged to the library's tl() function
+    $library_tl = C\NS_LIB . "tl";
+    $arguments = func_get_args();
+    return call_user_func_array($library_tl, $arguments);
+}
+/**
+ * shorthand for echo
+ *
+ * @param string $text string to send to the current output
+ */
+function e($text)
+{
+    // writes $text to the current output, exactly as echo would
+    print $text;
+}
+/**
+ * Base controller class for all controllers on
+ * the SeekQuarry site.
+ *
+ * @author Chris Pollett
+ */
+abstract class Controller
+{
+    /**
+     * Array of instances of views  used by this controller
+     * @var array
+     */
+    public $view_instances = [];
+    /**
+     * Array of instances of models used by this controller
+     * @var array
+     */
+    public $model_instances;
+    /**
+     * Array of instances of indexing_plugins used by this controller
+     * @var array
+     */
+    public $plugin_instances;
+    /**
+     * Says which activities (roughly methods invoked from the web) this
+     * controller will respond to
+     * @var array
+     */
+    public $activities = [];
+    /**
+     * Associative array of activity => component activity is on, used
+     * by @see Controller::call method to actually invoke a given activity
+     * on a given component
+     * @var array
+     */
+    public $activity_component = [];
+    /**
+     * Associative array of $components activities for this controller
+     * Components are collections of activities (a little like traits) which
+     * can be reused.
+     *
+     * @var array
+     */
+    public static $component_activities = [];
+    /**
+     * Sets up component activities, instance array, and plugins.
+     *
+     * Takes no arguments. If the request's activity is "signout", the
+     *      session user is cleared and a redirect is issued instead
+     */
+    public function __construct()
+    {
+        // this is to force a signout request to signout in all controllers
+        if (isset($_REQUEST['a']) && $_REQUEST['a'] == "signout") {
+            unset($_SESSION['USER_ID']);
+            unset($_REQUEST);
+            return $this->redirectWithMessage(
+                tl('search_controller_logout_successful'));
+        }
+        /* if no signout was made, get on with the business of building
+           this controller
+         */
+        mb_internal_encoding("UTF-8");
+        mb_regex_encoding("UTF-8");
+        /* register each component activity both in the activity => component
+           map used for dispatch and in the list of supported activities
+         */
+        $class = get_class($this);
+        foreach ($class::$component_activities as $component => $activities) {
+            foreach ($activities as $activity) {
+                $this->activity_component[$activity] = $component;
+                $this->activities[] = $activity;
+            }
+        }
+        $this->component_instances = [];
+        /* initialize the declared $plugin_instances property (this used to
+           assign to a misspelled, undeclared $plugins_instances property)
+         */
+        $this->plugin_instances = [];
+        $this->model_instances = [];
+        $this->view_instances = [];
+    }
+    /**
+     * This function should be overridden to handle web requests
+     */
+    abstract public function processRequest();
+    /**
+     * Dynamic loader for Component objects which might live on the current
+     * Controller
+     *
+     * @param string $component name of component to return
+     */
+    public function component($component)
+    {
+        // reuse the instance created by an earlier call, if there is one
+        if (isset($this->component_instances[$component])) {
+            return $this->component_instances[$component];
+        }
+        $class_name = C\NS_COMPONENTS . ucfirst($component) . "Component";
+        // components are constructed with this controller as their argument
+        $instance = new $class_name($this);
+        $this->component_instances[$component] = $instance;
+        return $instance;
+    }
+    /**
+     * Dynamic loader for Model objects which might live on the current
+     * Controller
+     *
+     * @param string $model name of model to return
+     */
+    public function model($model)
+    {
+        // reuse the instance created by an earlier call, if there is one
+        if (isset($this->model_instances[$model])) {
+            return $this->model_instances[$model];
+        }
+        $class_name = C\NS_MODELS . ucfirst($model) . "Model";
+        $instance = new $class_name();
+        $this->model_instances[$model] = $instance;
+        return $instance;
+    }
+    /**
+     * Dynamic loader for Plugin objects which might live on the current
+     * Controller
+     *
+     * @param string $plugin name of Plugin to return
+     */
+    public function plugin($plugin)
+    {
+        // reuse the instance created by an earlier call, if there is one
+        if (isset($this->plugin_instances[$plugin])) {
+            return $this->plugin_instances[$plugin];
+        }
+        $class_name = C\NS_PLUGINS . ucfirst($plugin) . "Plugin";
+        $instance = new $class_name();
+        $this->plugin_instances[$plugin] = $instance;
+        return $instance;
+    }
+    /**
+     * Used to get a list of all available indexing plugins for this Yioop
+     * instance.
+     *
+     * @return array elts of which are names of indexing plugins
+     */
+    public function getIndexingPluginList()
+    {
+        /* start from an empty list so an array is returned even when no
+           plugin files are found
+         */
+        $indexing_plugins = [];
+        $plugin_ext_len = strlen("Plugin.php");
+        $dir_prefixes = [C\BASE_DIR, C\APP_DIR];
+        foreach ($dir_prefixes as $dir_prefix) {
+            $plugin_dir = $dir_prefix."/library/indexing_plugins/";
+            $plugin_dir_len = strlen($plugin_dir);
+            // glob() may return false on error; treat that as no matches
+            $filenames = glob("$plugin_dir*Plugin.php");
+            if (!$filenames) {
+                continue;
+            }
+            foreach ($filenames as $filename) {
+                // strip the folder prefix and the "Plugin.php" suffix
+                $tmp_plug_name = substr($filename, $plugin_dir_len,
+                    -$plugin_ext_len);
+                // IndexingPlugin.php itself is not reported as a plugin
+                if ($tmp_plug_name != "Indexing") {
+                    $indexing_plugins[] = $tmp_plug_name;
+                }
+            }
+        }
+        return $indexing_plugins;
+    }
+    /**
+     * Dynamic loader for View objects which might live on the current
+     * Controller
+     *
+     * @param string $view name of view to return
+     */
+    public function view($view)
+    {
+        // reuse the instance created by an earlier call, if there is one
+        if (isset($this->view_instances[$view])) {
+            return $this->view_instances[$view];
+        }
+        $class_name = C\NS_VIEWS . ucfirst($view) . "View";
+        $instance = new $class_name();
+        $this->view_instances[$view] = $instance;
+        return $instance;
+    }
+    /**
+     * Send the provided view to output, drawing it with the given
+     * data variable, using the current locale for translation, and
+     * writing mode
+     *
+     * @param string $view   the name of the view to draw
+     * @param array $data   an array of values to use in drawing the view
+     */
+    public function displayView($view, $data)
+    {
+        $data['LOCALE_TAG'] = L\getLocaleTag();
+        $data['LOCALE_DIR'] = L\getLocaleDirection();
+        $data['BLOCK_PROGRESSION'] = L\getBlockProgression();
+        $data['WRITING_MODE'] = L\getWritingMode();
+        if (C\QUERY_STATISTICS) {
+            $data['QUERY_STATISTICS'] = [];
+            $machine =  isset($_SERVER["HTTP_HOST"]) ?
+                htmlentities($_SERVER["HTTP_HOST"]) : "localhost";
+            $machine_uri = isset($_SERVER['REQUEST_URI']) ?
+                htmlentities($_SERVER['REQUEST_URI']) : "/";
+            /* HTTPS is non-empty for https requests, but some servers set
+               it to "off" for plain http, so being set is not enough
+             */
+            $is_https = !empty($_SERVER["HTTPS"]) &&
+                strtolower($_SERVER["HTTPS"]) != "off";
+            $protocol = ($is_https) ? "https://" : "http://";
+            if ($machine == '::1') { //IPv6 :(
+                $machine = "[::1]/";
+                //used if the fetching and queue serving on the same machine
+            }
+            $data['YIOOP_INSTANCE'] = $protocol . $machine . $machine_uri;
+            $data['TOTAL_ELAPSED_TIME'] = 0;
+            // gather the query logs and timings of every model loaded so far
+            foreach ($this->model_instances as $model_name => $model) {
+                $data['QUERY_STATISTICS'] = array_merge(
+                    $model->db->query_log,
+                    $data['QUERY_STATISTICS']
+                    );
+                $data['TOTAL_ELAPSED_TIME'] +=
+                    $model->db->total_time;
+            }
+            $locale_info = L\getLocaleQueryStatistics();
+            $data['QUERY_STATISTICS'] = array_merge(
+                    $locale_info['QUERY_LOG'],
+                    $data['QUERY_STATISTICS']
+                    );
+            $data['TOTAL_ELAPSED_TIME'] +=
+                    $locale_info['TOTAL_ELAPSED_TIME'];
+            $mail_total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
+            $mail_messages = AnalyticsManager::get("MAIL_MESSAGES");
+            if ($mail_total_time && $mail_messages) {
+                $data['QUERY_STATISTICS'] = array_merge($mail_messages,
+                    $data['QUERY_STATISTICS']
+                    );
+                $data['TOTAL_ELAPSED_TIME'] += $mail_total_time;
+            }
+        }
+        $data['c'] = isset($_REQUEST['c']) ? $_REQUEST['c'] : null;
+        // a message stored by redirectWithMessage is shown once, then cleared
+        if (isset($_SESSION['DISPLAY_MESSAGE'])) {
+            $data['DISPLAY_MESSAGE'] = $_SESSION['DISPLAY_MESSAGE'];
+            unset($_SESSION['DISPLAY_MESSAGE']);
+        }
+        $this->view($view)->render($data);
+    }
+    /**
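+     * Redirects the browser (via a Location header, or a "go" response for
+     * XMLHttpRequests) to a url built from selected $_REQUEST fields, and
+     * sets a session variable to display a message on arrival. When run
+     * from the command line no redirect is done and an array is returned.
+     *
+     * @param string $message message to write
+     * @param mixed $copy_fields array of additional $_REQUEST fields to copy
+     *      for the redirect, or false to copy only the default fields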
+     */
+    public function redirectWithMessage($message, $copy_fields = false)
+    {
+        $default_fields = ["c", "a", C\CSRF_TOKEN, "just_thread",
+            "just_group_id", "just_user_id", "group_id", "user_id", "role_id",
+            "limit", "num"];
+        if ($copy_fields) {
+            $copy_fields = array_merge($default_fields, $copy_fields);
+        } else {
+            $copy_fields = $default_fields;
+        }
+        // copy over (cleaned) request fields that the redirect should keep
+        $query_array = [];
+        foreach ($copy_fields as $field) {
+            if (isset($_REQUEST[$field])) {
+                if (is_array($_REQUEST[$field])) {
+                    $array_params_cleaned = $_REQUEST[$field];
+                    foreach ($array_params_cleaned as $key => $value) {
+                        $query_array[$field][$this->clean($key, "string")] =
+                            $this->clean($value, "string") ;
+                    }
+                } else {
+                    $query_array[$field] = $this->clean($_REQUEST[$field],
+                        "string");
+                }
+            }
+        }
+        // drop fields that also have a non-empty entry in $_REQUEST['route']
+        if (isset($_REQUEST['route'])) {
+            foreach ($query_array as $field => $value) {
+                if (!empty($_REQUEST['route'][$field])) {
+                    unset($query_array[$field]);
+                }
+            }
+        }
+        $query_array = array_filter($query_array);
+        $location = ($query_array == []) ? C\BASE_URL :
+            "?" . http_build_query($query_array);
+        /* defined up front so the command line return below never returns
+           an undefined variable when there is no message
+         */
+        $data = [];
+        if ($message) {
+            $data['MESSAGE'] = $message;
+            $_SESSION['DISPLAY_MESSAGE'] = $message;
+        } else {
+            unset($_SESSION['DISPLAY_MESSAGE']);
+        }
+        if (php_sapi_name() == 'cli') {
+            //this case happens for configure_tool.php
+            return $data;
+        }
+        if (isset($_SERVER['HTTP_X_REQUESTED_WITH']) &&
+            $_SERVER['HTTP_X_REQUESTED_WITH'] == "XMLHttpRequest") {
+            e("go$location");
+        } else {
+            header("Location: $location");
+        }
+        exit();
+    }
+    /**
+     * When an activity involves displaying tabular data (such as rows of
+     * users, groups, etc), this method might be called to set up $data
+     * fields for next, prev, and page links, it also makes the call to the
+     * model to get the row data sorted and restricted as desired. For some
+     * data sources, rather than directly make a call to the model to get the
+     * data it might be passed directly to this method.
+     *
+     * @param array& $data used to send data to the view will be updated by
+     *     this method with row and paging data
+     * @param mixed $field_or_model if an object, this is assumed to be a model
+     *     and so the getRows method of this model is called to get row data,
+     *     sorted and restricted according to $search_array; if a string
+     *     then the row data is assumed to be in $data[$field_or_model] and
+     *     pagingLogic itself does the sorting and restricting.
+     * @param string $output_field output rows for the view will be stored in
+     *     $data[$output_field]
+     * @param int $default_show if not specified by $_REQUEST, then this will
+     *     be used to determine the maximum number of rows that will be
+     *     written to $data[$output_field]
+     * @param array $search_array used to sort and restrict in
+     *     the getRows call or the data from $data[$field_or_model].
+     *     Each element of this is a quadruple name of a field, what comparison
+     *     to perform, a value to check, and an order (ascending/descending)
+     *     to sort by
+     * @param string $var_prefix if there are multiple uses of pagingLogic
+     *     presented on the same view then $var_prefix can be prepended to
+     *     to the $data field variables like num_show, start_row, end_row
+     *     to distinguish between them
+     * @param array $args additional arguments that are passed to getRows and
+     *     in turn to selectCallback, fromCallback, and whereCallback that
+     *     might provide user_id, etc to further control which rows are
+     *     returned
+     */
+     public function pagingLogic(&$data, $field_or_model, $output_field,
+        $default_show, $search_array = [], $var_prefix = "", $args = null)
+     {
+        $data_fields = [];
+        $r = [];
+        $request_fields = ['num_show' => C\DEFAULT_ADMIN_PAGING_NUM,
+            'start_row' => 0, 'end_row' => C\DEFAULT_ADMIN_PAGING_NUM];
+        foreach ($request_fields as $field => $default) {
+            if (isset($_REQUEST[$var_prefix . $field])) {
+                $r[$field] = $_REQUEST[$var_prefix . $field];
+            } else {
+                $r[$field] = $default;
+            }
+        }
+        if ($r['start_row'] + $r['num_show'] != $r['end_row']) {
+            $r['end_row'] = $r['start_row'] + $r['num_show'];
+        }
+        $d = [];
+        $data_fields = ['NUM_TOTAL', 'NUM_SHOW', 'START_ROW', 'END_ROW',
+            'NEXT_START', 'NEXT_END', 'PREV_START', 'PREV_END'];
+        $var_field = strtoupper($var_prefix);
+        foreach ($data_fields as $field) {
+            $d[$field] = $var_prefix . $field;
+        }
+        $num_show = (isset($r['num_show']) &&
+            isset($this->view("admin")->helper("pagingtable")->show_choices[
+                $r['num_show']])) ? $r['num_show'] : $default_show;
+        $data[$d['NUM_SHOW']] = $num_show;
+        $data[$d['START_ROW']] = isset($r['start_row']) ?
+             max(0, $this->clean($r['start_row'],"int")) : 0;
+        if (is_object($field_or_model)) {
+            $data[$output_field] = $field_or_model->getRows(
+                $data[$d['START_ROW']], $num_show, $num_rows, $search_array,
+                $args);
+        } else {
+            $num_rows = count($data[$field_or_model]);
+            if ($search_array != []) {
+                $out_data = [];
+                foreach ($data[$field_or_model] as $name => $field_data) {
+                    $checks_passed = true;
+                    foreach ($search_array as $search_data) {
+                        list($column_name, $comparison, $search_value, $sort) =
+                            $search_data;
+                        if ($search_value == "") {continue; }
+                        if (isset($args[$column_name])) {
+                            $column_name = $args[$column_name];
+                        }
+                        $row_value = is_object($field_data) ?
+                            $field_data->$column_name:
+                            $field_data[$column_name];
+                        $cmp = strcmp($search_value, $row_value);
+                        if (($cmp == 0 && $comparison == "=") ||
+                            ($cmp != 0 && $comparison == "!=")
+                            ) {
+                            continue;
+                        }
+                        $pos = strpos($row_value, $search_value);
+                        $len_row = strlen($row_value);
+                        $len_search = strlen($search_value);
+                        if (($comparison == "CONTAINS" && $pos !== false) ||
+                            ($comparison == "BEGINS WITH" && $pos === 0) ||
+                            ($comparison == "ENDS WITH" && $pos === $len_row -
+                            $len_search)) {
+                            continue;
+                        }
+                        $checks_passed = false;
+                        break;
+                    }
+                    if ($checks_passed) {
+                        $out_data[$name] = $field_data;
+                    }
+                }
+                foreach ($search_array as $search_data) {
+                    list($column_name, $comparison, $search_value, $sort) =
+                        $search_data;
+                    if ($sort == "NONE") { continue; }
+                    if (isset($args[$column_name])) {
+                        $column_name = $args[$column_name];
+                    }
+                    $values = [];
+                    foreach ($out_data as $name => $field_data) {
+                        $values[$name] = is_object($field_data) ?
+                            $field_data->$column_name:
+                            $field_data[$column_name];
+                    }
+                    $sort = ($sort=="DESC") ? SORT_DESC: SORT_ASC;
+                    array_multisort($values, $sort, $out_data);
+                }
+            } else {
+                $out_data = $data[$field_or_model];
+            }
+            $data[$output_field] = array_slice($out_data,
+                $data[$d['START_ROW']], $num_show);
+        }
+        $data[$d['START_ROW']] = min($data[$d['START_ROW']], $num_rows);
+        $data[$d['END_ROW']] = min($data[$d['START_ROW']] + $num_show,
+            $num_rows);
+        if (isset($r['start_row'])) {
+            $data[$d['END_ROW']] = max($data[$d['START_ROW']],
+                    min($this->clean($r['end_row'],"int"), $num_rows));
+        }
+        $data[$d['NEXT_START']] = $data[$d['END_ROW']];
+        $data[$d['NEXT_END']] = min($data[$d['NEXT_START']] + $num_show,
+            $num_rows);
+        $data[$d['PREV_START']] = max(0, $data[$d['START_ROW']] - $num_show);
+        $data[$d['PREV_END']] = $data[$d['START_ROW']];
+        $data[$d['NUM_TOTAL']] = $num_rows;
+     }
+    /**
+     * Used to invoke an activity method of the current controller or one
+     * of its components
+     *
+     * @param $activity method to invoke
+     */
+     public function call($activity)
+     {
+        // activities without a registered component belong to the
+        // controller itself and are invoked on $this
+        if (!isset($this->activity_component[$activity])) {
+            return $this->$activity();
+        }
+        // otherwise hand the activity to the component registered for it
+        $component_name = $this->activity_component[$activity];
+        return $this->component($component_name)->$activity();
+     }
+    /**
+     * Generates a cross site request forgery preventing token based on the
+     * provided user name, the current time and the hidden AUTH_KEY
+     *
+     * @param string $user   username to use to generate token
+     * @return string   a csrf token
+     */
+    public function generateCSRFToken($user)
+    {
+        $now = time();
+        // keep the time of the previously issued token; checkCSRFTime()
+        // compares incoming tokens against $_SESSION['OLD_CSRF_TIME']
+        $previous_time = 0;
+        if (isset($_SESSION['CSRF_TIME'])) {
+            $previous_time = $_SESSION['CSRF_TIME'];
+        }
+        $_SESSION['OLD_CSRF_TIME'] = $previous_time;
+        $_SESSION['CSRF_TIME'] = $now;
+        // token format is <hash>*<timestamp>
+        $hash = L\crawlHash($user . $now . C\AUTH_KEY);
+        return $hash . "*" . $now;
+    }
+    /**
+     * Checks if the form CSRF (cross-site request forgery preventing) token
+     * matches the given user and has not expired (1 hour till expires)
+     *
+     * @param string $token_name attribute of $_REQUEST containing CSRFToken
+     * @param string $user  user id
+     * @return bool  whether the CSRF token was valid
+     */
+    public function checkCSRFToken($token_name, $user)
+    {
+        /* Tokens have the form <hash>*<timestamp> and must be exactly
+           22 characters long. Reject anything that is not a string (for
+           example an array sent as token[]=...) before measuring it.
+         */
+        if (!isset($_REQUEST[$token_name]) ||
+            !is_string($_REQUEST[$token_name]) ||
+            strlen($_REQUEST[$token_name]) != 22) {
+            return false;
+        }
+        $token_parts = explode("*", $_REQUEST[$token_name]);
+        /* The timestamp must be purely digits: arithmetic on a
+           non-numeric string is an error on PHP 8 and a silent 0 on
+           earlier versions.
+         */
+        if (!isset($token_parts[1]) || !ctype_digit($token_parts[1])) {
+            return false;
+        }
+        $hash = $token_parts[0];
+        $timestamp = $token_parts[1];
+        // token expires one hour after it was issued
+        if ($timestamp + C\ONE_HOUR <= time()) {
+            return false;
+        }
+        /* hash_equals is a timing-safe, strictly string comparison; a
+           loose == could treat two numeric-looking hashes as equal.
+         */
+        $expected = L\crawlHash($user . $timestamp . C\AUTH_KEY);
+        return hash_equals((string)$expected, $hash);
+    }
+    /**
+     * Checks if the timestamp in $_REQUEST[$token_name]
+     * matches the timestamp of the last CSRF token accessed by this user
+     * for the kind of activity for which there might be a conflict.
+     * This is to avoid accidental replays of postings etc if the back button
+     * is used.
+     *
+     * @param string $token_name name of a $_REQUEST field used to hold a
+     *     CSRF_TOKEN
+     * @param string $action name of current action to check for conflicts
+     * @return bool whether a conflicting action has occurred.
+     */
+     public function checkCSRFTime($token_name, $action = "")
+     {
+        if (!isset($_REQUEST[$token_name])) {
+            return false;
+        }
+        // token format is <hash>*<timestamp>; only the timestamp is used
+        $token_parts = explode("*", $_REQUEST[$token_name]);
+        if (!isset($token_parts[1])) {
+            return false;
+        }
+        $timestamp_to_check = $token_parts[1];
+        if ($action == "") {
+            // no action given: token must carry the time of the previously
+            // issued token stored in the session
+            return isset($_SESSION['OLD_CSRF_TIME']) &&
+                $timestamp_to_check == $_SESSION['OLD_CSRF_TIME'];
+        }
+        // a stamp already recorded for this action that is newer than the
+        // token's means the token is out of date
+        $has_newer_stamp = isset($_SESSION['OLD_ACTION_STAMPS'][$action]) &&
+            !($_SESSION['OLD_ACTION_STAMPS'][$action] <= $timestamp_to_check);
+        if ($has_newer_stamp) {
+            return false;
+        }
+        $_SESSION['OLD_ACTION_STAMPS'][$action] = $timestamp_to_check;
+        // forget action stamps that are more than an hour old
+        $cull_time = time() - C\ONE_HOUR;
+        foreach ($_SESSION['OLD_ACTION_STAMPS'] as $act => $time) {
+            if ($time < $cull_time) {
+                unset($_SESSION['OLD_ACTION_STAMPS'][$act]);
+            }
+        }
+        return true;
+     }
+    /**
+     * Used to clean strings that might be tainted as originate from the user
+     *
+     * @param mixed $value tainted data
+     * @param mixed $type type of data in value can be one of the following
+     *      strings: bool, boolean, color, double, file_name, float, hash,
+     *      int, string, or web-url;
+     *      or it can be an array listing allowed values. If the latter, then
+     *      if the value is not in the array the cleaned value will be first
+     *      element of the array if $default is null
+     * @param mixed $default if $value is not set default value is returned,
+     *     this isn't used much since if the error_reporting is E_ALL
+     *     or -1 you would still get a Notice.
+     * @return string the clean input matching the type provided
+     */
+    public function clean($value, $type, $default = null)
+    {
+        $clean_value = null;
+        if (is_array($type)) {
+            /* whitelist mode: $value is returned if it is (loosely) equal
+               to some element of $type; otherwise $default if it is
+               non-null/non-empty, otherwise the first element of $type
+             */
+            if (in_array($value, $type)) {
+                return $value;
+            } else {
+                if ($default != null) {
+                    return $default;
+                }
+                reset($type);
+                return current($type);
+            }
+        }
+        switch ($type) {
+            case "boolean":
+                // no break
+            case "bool":
+                if (isset($value)) {
+                    if (is_bool($value)) {
+                        // a genuine boolean passes through unchanged
+                        $clean_value = $value;
+                    } else {
+                        /* Only the literal string "true" or a non-zero
+                           numeric value counts as true. Strict/numeric
+                           checks are used because loose comparison of
+                           strings with 0 differs between PHP 7 and 8.
+                         */
+                        $clean_value = ($value === "true" ||
+                            (is_numeric($value) && $value != 0));
+                    }
+                } else if ($default != null) {
+                    $clean_value = $default;
+                } else {
+                    $clean_value = false;
+                }
+                break;
+            case "color":
+                if (isset($value)) {
+                    $colors = ["black", "silver", "gray", "white",
+                        "maroon", "red", "purple", "fuchsia", "green", "lime",
+                        "olive", "yellow", "navy", "blue", "teal", "aqua",
+                        "orange", "aliceblue", "antiquewhite", "aquamarine",
+                        "azure", "beige", "bisque", "blanchedalmond",
+                        "blueviolet", "brown", "burlywood", "cadetblue",
+                        "chartreuse", "chocolate", "coral", "cornflowerblue",
+                        "cornsilk", "crimson", "darkblue", "darkcyan",
+                        "darkgoldenrod", "darkgray", "darkgreen", "darkgrey",
+                        "darkkhaki", "darkmagenta", "darkolivegreen",
+                        "darkorange", "darkorchid", "darkred", "darksalmon",
+                        "darkseagreen", "darkslateblue", "darkslategray",
+                        "darkslategrey", "darkturquoise", "darkviolet",
+                        "deeppink", "deepskyblue", "dimgray", "dodgerblue",
+                        "firebrick", "floralwhite", "forestgreen", "gainsboro",
+                        "ghostwhite", "gold", "goldenrod", "greenyellow",
+                        "grey", "honeydew", "hotpink", "indianred", "indigo",
+                        "ivory", "khaki", "lavender", "lavenderblush",
+                        "lawngreen", "lemonchiffon", "lightblue", "lightcoral",
+                        "lightcyan", "lightgoldenrodyellow", "lightgray",
+                        "lightgreen", "lightgrey", "lightpink", "lightsalmon",
+                        "lightseagreen", "lightskyblue", "lightslategray",
+                        "lightslategrey", "lightsteelblue", "lightyellow",
+                        "limegreen", "linen", "mediumaquamarine",
+                        "mediumblue", "mediumorchid", "mediumpurple",
+                        "mediumseagreen", "mediumslateblue",
+                        "mediumspringgreen", "mediumturquoise",
+                        "mediumvioletred", "midnightblue", "mintcream",
+                        "mistyrose", "moccasin", "navajowhite", "oldlace",
+                        "olivedrab", "orangered", "orchid", "palegoldenrod",
+                        "palegreen", "paleturquoise", "palevioletred",
+                        "papayawhip", "peachpuff", "peru", "pink", "plum",
+                        "powderblue", "rosybrown", "royalblue", "saddlebrown",
+                        "salmon", "sandybrown", "seagreen", "seashell",
+                        "sienna",  "skyblue", "slateblue", "slategray",
+                        "slategrey", "snow", "springgreen", "steelblue",
+                        "tan", "thistle", "tomato", "turquoise", "violet",
+                        "wheat", "whitesmoke", "yellowgreen", "rebeccapurple"
+                    ];
+                    /* Trim once and test the trimmed value both against
+                       the named colors (strictly, so 0 cannot match a
+                       name) and against the #RGB / #RRGGBB pattern.
+                     */
+                    $color = trim($value);
+                    if (in_array($color, $colors, true)
+                        || preg_match('/^#[a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]'.
+                        '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9])?$/', $color)) {
+                        $clean_value = $color;
+                    } else {
+                        $clean_value = "#FFF";
+                    }
+                } else if ($default != null) {
+                    $clean_value = $default;
+                } else {
+                    $clean_value = "#FFF";
+                }
+                break;
+            case "double":
+                if (isset($value)) {
+                    $clean_value = doubleval($value);
+                } else if ($default != null) {
+                    $clean_value = $default;
+                } else {
+                    $clean_value = 0;
+                }
+                break;
+            case "float":
+                if (isset($value)) {
+                    $clean_value = floatval($value);
+                } else if ($default != null) {
+                    $clean_value = $default;
+                } else {
+                    $clean_value = 0;
+                }
+                break;
+            case "file_name":
+                if (isset($value)) {
+                    // drop path separators, wildcards and drive/stream
+                    // colons so the result cannot name another directory
+                    $value = str_replace("&amp;", "&", $value);
+                    $value = str_replace("/", "", $value);
+                    $value = str_replace("\\", "", $value);
+                    $value = str_replace("*", "", $value);
+                    $clean_value = str_replace(":", "", $value);
+                } else {
+                    $clean_value = $default;
+                }
+                break;
+            case "hash":
+                if (isset($value)) {
+                    /* accepted only if it has the same length as a
+                       crawlHash output and base64 decodes to something
+                       non-empty; otherwise the result stays null
+                     */
+                    if (strlen($value) == strlen(L\crawlHash("A")) &&
+                        base64_decode($value)) {
+                        $clean_value = $value;
+                    }
+                } else {
+                    $clean_value = $default;
+                }
+                break;
+            case "int":
+                if (isset($value)) {
+                    $clean_value = intval($value);
+                } else if ($default != null) {
+                    $clean_value = $default;
+                } else {
+                    $clean_value = 0;
+                }
+                break;
+            case "string":
+                if (isset($value)) {
+                    // undo an existing &amp; first so that htmlentities
+                    // does not double encode it
+                    $value2 = str_replace("&amp;", "&", $value);
+                    // -CP REMEMBER TO CK THIS!!!!!
+                    $value2 = mb_convert_encoding($value2, "UTF-8");
+                    $clean_value = @htmlentities($value2, ENT_QUOTES, "UTF-8");
+                } else {
+                    $clean_value = $default;
+                }
+                break;
+            case 'web-url':
+                if (isset($value)) {
+                    $value = trim($value);
+                    $start = substr($value, 0, 4);
+                    $is_web = ($start == 'http');
+                    // anything that is neither http(s) nor gopher gets an
+                    // http:// scheme prepended
+                    if (!$is_web && $start != "goph") {
+                        $value = "http://$value";
+                        $is_web = true;
+                    }
+                    if ($is_web) {
+                        $value = str_replace("&amp;", "&", $value);
+                        $clean_value = @htmlentities($value, ENT_QUOTES,
+                            "UTF-8");
+                    }
+                    /* NOTE(review): for a value starting with "goph"
+                       $clean_value is still null here, so canonicalLink
+                       receives null -- confirm this is intended.
+                     */
+                    $clean_value = UrlParser::canonicalLink($clean_value,
+                        $default, false);
+                } else {
+                    $clean_value = $default;
+                }
+                break;
+        }
+        return $clean_value;
+    }
+    /**
+     * Converts an array of lines of strings into a single string with
+     * proper newlines, each line having been trimmed and potentially
+     * cleaned
+     *
+     * @param array $arr the array of lines to be processed
+     * @param string $endline_string what string should be used to indicate
+     *     the end of a line
+     * @param bool $clean whether to clean each line
+     * @return string a concatenated string of cleaned lines
+     */
+    public function convertArrayLines($arr, $endline_string="\n",
+        $clean = false)
+    {
+        // collect the processed lines, then join them so the separator
+        // appears only between lines, never before the first or after
+        // the last
+        $out_lines = [];
+        foreach ($arr as $line) {
+            $out_line = trim($line);
+            if ($clean) {
+                $out_line = $this->clean($out_line, "string");
+            }
+            $out_lines[] = trim($out_line);
+        }
+        return implode($endline_string, $out_lines);
+    }
+    /**
+     * Cleans a string consisting of lines, typically of urls into an array of
+     * clean lines. This is used in handling data from the crawl options
+     * text areas. # is treated as a comment
+     *
+     * @param string $str contains the url data
+     * @param string $line_type does additional cleaning depending on the type
+     *     of the lines. For instance, if is "url" then a line not beginning
+     *     with a url scheme will have http:// prepended.
+     * @return $lines an array of clean lines
+     */
+    public function convertStringCleanArray($str, $line_type="url")
+    {
+        // six character prefixes that mark a line as already having a
+        // scheme (or being a domain: directive)
+        $known_starts = ["file:/", "http:/", "domain", "https:", 'gopher'];
+        $lines = [];
+        foreach (preg_split('/\n+/', $str) as $raw_line) {
+            $clean_line = trim($this->clean($raw_line, "string"));
+            if (strlen($clean_line) == 0) {
+                // blank lines are dropped
+                continue;
+            }
+            if ($line_type == "url") {
+                $prefix = substr($clean_line, 0, 6);
+                $is_comment = ($prefix[0] == "#");
+                // comment lines and lines with a known start are kept
+                // as is; everything else is assumed to be an http url
+                if (!$is_comment && !in_array($prefix, $known_starts)) {
+                    $clean_line = "http://". $clean_line;
+                }
+            }
+            $lines[] = $clean_line;
+        }
+        return $lines;
+    }
+    /**
+     * Checks if a request is for a valid activity and if it uses
+     * the correct authorization key
+     *
+     * @return bool whether the request was valid or not
+     */
+    public function checkRequest()
+    {
+        if (empty($_REQUEST['time']) || empty($_REQUEST['session'])) {
+            return false;
+        }
+        // 'a' and 'c' may be absent; read them without raising notices
+        $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : null;
+        $controller = isset($_REQUEST['c']) ? $_REQUEST['c'] : null;
+        $known_activity = is_string($activity) &&
+            in_array($activity, $this->activities, true);
+        // requests for the jobs controller are allowed through without a
+        // known activity
+        if (!$known_activity && $controller !== 'jobs') {
+            return false;
+        }
+        $time = $_REQUEST['time'];
+        $session = $_REQUEST['session'];
+        // arrays or non-numeric times cannot be valid and would otherwise
+        // reach the arithmetic/hash calls below
+        if (!is_string($session) || !is_numeric($time)) {
+            return false;
+        }
+        // request must be within an hour of this machine's clock
+        if (abs(time() - $time) > C\ONE_HOUR) {
+            return false;
+        }
+        /* session must be the md5 of the time concatenated with
+           AUTH_KEY; hash_equals gives a strict, timing-safe comparison
+         */
+        return hash_equals(md5($time . C\AUTH_KEY), $session);
+    }
+    /**
+     * Used to set up the head variables for and page_data of a wiki or static
+     * page associated with a view.
+     *
+     * @param object $view View on which page data will be rendered
+     * @param string $page_name a string name/id to associate with page. For
+     *     example, might have 404 for a page about 404 errors
+     * @param string $page_data this is the actual content of a wiki or
+     *      static page
+     */
+    public function parsePageHeadVarsView($view, $page_name, $page_data)
+    {
+        // with $with_body true the parser returns [head vars, page body]
+        $parsed = $this->parsePageHeadVars($page_data, true);
+        $view->head_objects[$page_name] = $parsed[0];
+        $view->page_objects[$page_name] = $parsed[1];
+    }
+    /**
+     * Used to parse head meta variables out of a data string provided either
+     * from a wiki page or a static page. Meta data is stored in lines
+     * before the first occurrence of END_HEAD_VARS. Head variables
+     * are name=value pairs. An example of head
+     * variable might be:
+     * title = This web page's title
+     * Anything after a semi-colon on a line in the head section is treated as
+     * a comment
+     *
+     * @param string $page_data this is the actual content of a wiki or
+     *      static page
+     * @param bool $with_body whether to output just an array of head
+     *      variables or to output a pair [head vars, page body]
+     * @return array the associative array of head variables or pair
+     *      [head vars, page body]
+     */
+    public function parsePageHeadVars($page_data, $with_body = false)
+    {
+        $page_parts = explode("END_HEAD_VARS", $page_data);
+        $head_object = [];
+        if (count($page_parts) > 1) {
+            // everything before the first END_HEAD_VARS is the head; head
+            // entries are separated by blank lines
+            $head_lines = preg_split("/\n\n/", array_shift($page_parts));
+            // later occurrences of END_HEAD_VARS belong to the body
+            $page_data = implode("END_HEAD_VARS", $page_parts);
+            foreach ($head_lines as $line) {
+                /* strip a trailing comment; compare with false explicitly
+                   so a semi-colon at offset 0 (a whole line comment) is
+                   recognized too
+                 */
+                $semi_pos = strpos($line, ";");
+                if ($semi_pos !== false) {
+                    $line = substr($line, 0, $semi_pos);
+                }
+                // only lines of the exact form name=value are used
+                $line_parts = explode("=", $line);
+                if (count($line_parts) == 2) {
+                    $head_object[trim(urldecode($line_parts[0]))] =
+                        urldecode(trim($line_parts[1]));
+                }
+            }
+        } else {
+            $page_data = $page_parts[0];
+        }
+        if ($with_body) {
+            return [$head_object, $page_data];
+        }
+        return $head_object;
+    }
+    /**
+     * If external source advertisements are present in the output of this
+     * controller this function can be used to initialize the field variables
+     * used to write the appropriate Javascripts
+     *
+     * @param array& $data data to be used in drawing the view
+     * @param bool $ads_off whether or not ads are turned off so that this
+     *      method should do nothing
+     */
+     public function initializeAdFields(&$data, $ads_off = false)
+     {
+        // nothing to set up when the site is configured without ads
+        if (C\AD_LOCATION == "none") {
+            return;
+        }
+        $data["AD_LOCATION"] = ($ads_off) ? "none" : C\AD_LOCATION;
+        foreach (['TOP_ADSCRIPT', 'SIDE_ADSCRIPT', 'GLOBAL_ADSCRIPT'] as
+            $ad_field) {
+            // ad scripts are read from same-named configuration constants
+            $ad_script = html_entity_decode(
+                constant(C\NS_CONFIGS . $ad_field), ENT_QUOTES);
+            // turn any remaining encoded parentheses back into ( and )
+            $data[$ad_field] = str_replace(["&#40;", "&#41;"], ["(", ")"],
+                $ad_script);
+        }
+     }
+    /**
+     * Adds to an integer, $actual_value, epsilon-noise taken from an
+     * L_1 gaussian source centered at $actual_value to get an epsilon
+     * private, integer value.
+     *
+     * @param int $actual_value number want to make private
+     * @return int $fuzzy_value number after noise added
+     */
+    public function addDifferentialPrivacy($actual_value)
+    {
+        $sigma = 1 / C\PRIVACY_EPSILON;
+        $max_value = (2 * $actual_value) + 1;
+        /* Calculation by Integration
+         * f: exp(-1 * ((abs($actual_value - $x)) / $sigma))
+         * Since function consists of absolute value, break the integration
+         * into two to remove absolute from the function.
+         * First integral runs from 0 through $actual_value, and second
+         * integral runs from $actual_value through $max_value.
+         * Then after using substitution rule, the two integrals have
+         * ranges [a, 0] and [0, c]
+         */
+        $a = -1 * $actual_value;
+        $c = -1 * ($max_value - $actual_value);
+        $integral_value = $sigma * (2 - exp($a / $sigma) - exp($c / $sigma));
+        /* Draw a real number uniformly from [0, $integral_value]. rand()
+           only accepts integer bounds, so passing it the float total
+           truncated the range and made the draw coarse for small sigma.
+         */
+        $random = $integral_value * (mt_rand() / mt_getrandmax());
+        /* Walk the candidate values accumulating f until the running
+           total passes the random draw. abs() keeps the summed weights
+           symmetric about $actual_value, as f above requires.
+         */
+        $p = 0;
+        for ($i = 0; $i < $max_value; $i++) {
+            $p += exp(-1 * (abs($actual_value - $i) / $sigma));
+            if ($p > $random) {
+                break;
+            }
+        }
+        // if the total never passed the draw, $i ends as $max_value
+        $fuzzy_value = $i;
+        return $fuzzy_value;
+    }
+}
--- a/src/controllers/CrawlController.php
+++ b/src/controllers/CrawlController.php
@ -0,0 +1,328 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\MediaConstants;
+use seekquarry\yioop\library\UrlParser;
+
+/**
+ * Controller used to manage networked installations of Yioop where
+ * there might be multiple queue_servers and a name_server. Command
+ * sent to the nameserver web page are mapped out to queue_servers
+ * using this controller. Each method of the controller essentially
+ * mimics one method of CrawlModel, PhraseModel, or in general anything
+ * that extends ParallelModel and is used to proxy that information
+ * through a result web page back to the name_server.
+ *
+ * @author Chris Pollett
+ */
+class CrawlController extends Controller implements CrawlConstants
+{
+    /*
+       NOTE(review): several activities below call unserialize() on the
+       "arg" request field. processRequest() only dispatches after
+       checkRequest() succeeds, but unserialize() on request data is still
+       risky; consider a data-only encoding if the name server side can be
+       changed as well.
+     */
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    public $activities = ["countWords","clearQuerySavePoint",
+        "crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
+        "combinedCrawlInfo", "getInfoTimestamp", "getCrawlItems",
+        "getCrawlList", "getCrawlSeedInfo", "sendStartCrawlMessage",
+        "sendStopCrawlMessage", "setCrawlSeedInfo",
+        ];
+    /**
+     * Checks that the request seems to be coming from a legitimate machine
+     * then determines which activity is being requested and calls that
+     * activity for processing.
+     */
+    public function processRequest()
+    {
+        /* do a quick test to see if this is a request seems like
+           from a legitimate machine
+         */
+        if (!$this->checkRequest()) {
+            return;
+        }
+        $activity = $_REQUEST['a'];
+        if (in_array($activity, $this->activities)) {
+            $this->call($activity);
+        }
+    }
+    /**
+     * Handles a request for whether or not the crawl is stalled on the
+     * given local server (which means no fetcher has spoken to it in a while)
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function crawlStalled()
+    {
+        echo L\webencode(serialize($this->model("crawl")->crawlStalled()));
+    }
+    /**
+     * Handles a request for the crawl status (memory use, recent fetchers
+     * crawl rate, etc) data from a remote name server
+     * and retrieves the statistics about this that are held by the
+     * local queue server
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function crawlStatus()
+    {
+        echo L\webencode(serialize($this->model("crawl")->crawlStatus()));
+    }
+    /**
+     * Handles a request for the starting parameters of a crawl of a given
+     * timestamp and retrieves that information from the bundle held by the
+     * local queue server
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function getCrawlSeedInfo()
+    {
+        $timestamp = 0;
+        if (isset($_REQUEST["arg"]) ) {
+            $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
+            // force the value to an int of at most TIMESTAMP_LEN digits
+            $timestamp = substr($this->clean($timestamp, "int"), 0,
+                C\TIMESTAMP_LEN);
+        }
+        echo L\webencode(serialize($this->model("crawl")->getCrawlSeedInfo(
+            $timestamp)));
+    }
+    /**
+     * Handles a request to change the parameters of a crawl of a given
+     * timestamp on the local machine (does nothing if crawl doesn't exist)
+     */
+    public function setCrawlSeedInfo()
+    {
+        if (isset($_REQUEST["arg"]) ) {
+            list($timestamp, $info) = unserialize(
+                L\webdecode($_REQUEST["arg"]));
+            $timestamp = substr($this->clean($timestamp, "int"), 0,
+                C\TIMESTAMP_LEN);
+            if ($timestamp && $info) {
+                $this->model("crawl")->setCrawlSeedInfo($timestamp, $info);
+            }
+        }
+    }
+    /**
+     * Handles a request for information about a crawl with a given timestamp
+     * from a remote name server and retrieves statistics about this crawl
+     * that are held by the local queue server (number of pages, name, etc)
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function getInfoTimestamp()
+    {
+        $timestamp = 0;
+        if (isset($_REQUEST["arg"]) ) {
+            $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
+            $timestamp = substr($this->clean($timestamp, "int"), 0,
+                C\TIMESTAMP_LEN);
+        }
+        echo L\webencode(serialize($this->model("crawl")->getInfoTimestamp(
+            $timestamp)));
+    }
+    /**
+     * Handles a request for the crawl list (what crawls are stored on the
+     * machine) data from a remote name server and retrieves the
+     * statistics about this that are held by the local queue server
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function getCrawlList()
+    {
+        $return_arc_bundles = false;
+        $return_recrawls = false;
+        if (isset($_REQUEST["arg"]) ) {
+            $arg = trim(L\webdecode($_REQUEST["arg"]));
+            $arg = $this->clean($arg, "int");
+            // arg acts as a two bit flag: 1 = arc bundles, 2 = recrawls
+            if ($arg == 3 || $arg == 1) {
+                $return_arc_bundles = true;
+            }
+            if ($arg == 3 || $arg == 2) {
+                $return_recrawls = true;
+            }
+        }
+        echo L\webencode(serialize($this->model("crawl")->getCrawlList(
+            $return_arc_bundles, $return_recrawls)));
+    }
+    /**
+     * Handles a request for the combined crawl list, stalled, and status
+     * data from a remote name server and retrieves the statistics about
+     * this that are held by the local queue server
+     * outputs this info back as body of the http response (url encoded,
+     * serialized php data)
+     */
+    public function combinedCrawlInfo()
+    {
+        $combined =  $this->model("crawl")->combinedCrawlInfo();
+        echo L\webencode(serialize($combined));
+    }
+    /**
+     * Receives a request to delete a crawl from a remote name server
+     * and then deletes crawl on the local queue server. Does nothing if
+     * the request carries no usable timestamp.
+     */
+    public function deleteCrawl()
+    {
+        if (!isset($_REQUEST["arg"]) ) {
+            return;
+        }
+        /* Decode first, then sanitize. Doing these in the opposite order
+           cleaned an undefined variable and passed the raw decoded value
+           on to the model.
+         */
+        $timestamp = unserialize(L\webdecode($_REQUEST["arg"]));
+        $timestamp = substr($this->clean($timestamp, "int"), 0,
+            C\TIMESTAMP_LEN);
+        if (!$timestamp) {
+            return;
+        }
+        $this->model("crawl")->deleteCrawl($timestamp);
+    }
+
+    /**
+     * Receives a request to inject new urls into the active
+     * crawl from a remote name server and then does this for
+     * the local queue server
+     */
+    public function injectUrlsCurrentCrawl()
+    {
+        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
+            || !isset($_REQUEST["i"])) {
+            return;
+        }
+        $num = $this->clean($_REQUEST["num"], "int");
+        $i = $this->clean($_REQUEST["i"], "int");
+        list($timestamp, $inject_urls) =
+            unserialize(L\webdecode($_REQUEST["arg"]));
+        $timestamp = substr($this->clean($timestamp, "int"), 0,
+            C\TIMESTAMP_LEN);
+        // keep only the urls whose host hashes to partition $i of $num
+        $inject_urls = L\partitionByHash($inject_urls,
+            null, $num, $i, C\NS_LIB . "UrlParser::getHost");
+        $this->model("crawl")->injectUrlsCurrentCrawl($timestamp,
+            $inject_urls, null);
+    }
+    /**
+     * Receives a request to get crawl summary data for an array of urls
+     * from a remote name server and then looks these up on the local
+     * queue server
+     */
+    public function getCrawlItems()
+    {
+        $crawl_model = $this->model("crawl");
+        $start_time = microtime(true);
+        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
+            || !isset($_REQUEST["i"])) {
+            return;
+        }
+        $num = $this->clean($_REQUEST["num"], "int");
+        $i = $this->clean($_REQUEST["i"], "int");
+        $crawl_model->current_machine = $i;
+        $lookups = unserialize(L\webdecode($_REQUEST["arg"]));
+        $our_lookups = [];
+        foreach ($lookups as $lookup => $lookup_info) {
+            if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h'
+                || $lookup_info[0][0] === 'r'
+                || $lookup_info[0][0] === 'g')) {
+                // a single pair whose first entry starts with h, r or g
+                // is passed on unchanged
+                $our_lookups[$lookup] = $lookup_info;
+            } else {
+                $our_lookups[$lookup] = [];
+                foreach ($lookup_info as $lookup_item) {
+                    if (count($lookup_item) == 2) {
+                        $our_lookups[$lookup][] = $lookup_item;
+                    } else {
+                        // longer items start with a machine index; keep
+                        // only those addressed to this machine
+                        $index = $lookup_item[0];
+                        if ($index == $i) {
+                            $our_lookups[$lookup][] = $lookup_item;
+                        }
+                    }
+                }
+            }
+        }
+        $items = $crawl_model->getCrawlItems($our_lookups);
+        $items["ELAPSED_TIME"] = L\changeInMicrotime($start_time);
+        header("Content-Type: text/plain");
+        $items = L\webencode(serialize($items));
+        header("Content-Length: ".strlen($items));
+        echo $items;
+        flush();
+    }
+    /**
+     * Receives a request to get counts of the number of occurrences of an
+     * array of words from a remote name server and then
+     * determines and outputs these counts for the local queue server
+     */
+    public function countWords()
+    {
+        if (!isset($_REQUEST["arg"]) ) {
+            return;
+        }
+        $crawl_model = $this->model("crawl");
+        list($words, $index_name) = unserialize(L\webdecode($_REQUEST["arg"]));
+        $crawl_model->index_name = $index_name;
+        echo L\webencode(serialize(
+            $crawl_model->countWords($words)));
+    }
+    /**
+     * Receives a request to stop a crawl from a remote name server
+     * and then stop the current crawl on the local queue server
+     */
+    public function sendStopCrawlMessage()
+    {
+        $this->model("crawl")->sendStopCrawlMessage();
+    }
+    /**
+     * Receives a request to start a crawl from a remote name server
+     * and then starts the crawl process on the local queue server
+     */
+    public function sendStartCrawlMessage()
+    {
+        if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
+            || !isset($_REQUEST["i"])) {
+            return;
+        }
+        $num = $this->clean($_REQUEST["num"], "int");
+        $i = $this->clean($_REQUEST["i"], "int");
+        list($crawl_params,
+            $seed_info) = unserialize(L\webdecode($_REQUEST["arg"]));
+        // keep only the seed urls whose host hashes to partition $i of $num
+        $seed_info['seed_sites']['url'] =
+            L\partitionByHash($seed_info['seed_sites']['url'],
+            null, $num, $i, C\NS_LIB . "UrlParser::getHost");
+        $this->model("crawl")->sendStartCrawlMessage($crawl_params, $seed_info,
+            null);
+    }
+    /**
+     * A save point is used to store to disk a sequence generation-doc-offset
+     * pairs of a particular mix query when doing an archive crawl of a crawl
+     * mix. This is used so that the mix can remember where it was the next
+     * time it is invoked by the web app on the machine in question.
+     * This function deletes such a save point associated with a timestamp
+     */
+    public function clearQuerySavePoint()
+    {
+        if (!isset($_REQUEST["arg"])) {
+            return;
+        }
+        $save_timestamp = substr($this->clean($_REQUEST["arg"], "int"), 0,
+            C\TIMESTAMP_LEN);
+        $this->model("crawl")->clearQuerySavePoint($save_timestamp);
+    }
+}
+
--- a/src/controllers/FetchController.php
+++ b/src/controllers/FetchController.php
@ -0,0 +1,643 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\classifiers\Classifier;
+
+// to allow the calculation of longer archive schedules; this value is also
+// read back by archiveSchedule() to decide when a name server lock is stale
+ini_set('max_execution_time', 60);
+/**
+ * This class handles data coming to a queue_server from a fetcher
+ * Basically, it receives the data from the fetcher and saves it into
+ * various files for later processing by the queue server.
+ * This class can also be used by a fetcher to get status information.
+ *
+ * @author Chris Pollett
+ */
+class FetchController extends Controller implements CrawlConstants
+{
+    /**
+     * These are the activities supported by this controller; processRequest
+     * only dispatches to a requested activity if it appears in this list
+     * @var array
+     */
+    public $activities = ["schedule", "archiveSchedule", "update", "crawlTime"];
+    /**
+     * Number of seconds that must elapse after last call before doing
+     * cron activities (mainly check liveness of fetchers which should be
+     * alive)
+     * @var int
+     */
+    const CRON_INTERVAL = 300;
+    /**
+     * Checks that the request seems to be coming from a legitimate fetcher then
+     * determines which activity the fetcher is requesting and calls that
+     * activity for processing.
+     *
+     * If the request carries both robot_instance and machine_uri, the robot
+     * table file is also updated with the requester's address, its machine
+     * uri, and the current time.
+     */
+    public function processRequest()
+    {
+        /* do a quick test to see if this request seems to be
+           from a legitimate machine
+         */
+        if (!$this->checkRequest()) {
+            return;
+        }
+        $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : "";
+        if (isset($_REQUEST['robot_instance']) &&
+            isset($_REQUEST['machine_uri'])) {
+            // the table is only read when it is about to be rewritten
+            $robot_table_name = C\CRAWL_DIR."/".self::robot_table_name;
+            $robot_table = [];
+            if (file_exists($robot_table_name)) {
+                $robot_table = unserialize(
+                    file_get_contents($robot_table_name));
+                if (!is_array($robot_table)) {
+                    // unreadable or corrupt table: start again from empty
+                    $robot_table = [];
+                }
+            }
+            $robot_table[$this->clean($_REQUEST['robot_instance'], "string")] =
+                [$_SERVER['REMOTE_ADDR'],
+                $this->clean($_REQUEST['machine_uri'], "string"),
+                time()];
+            file_put_contents($robot_table_name, serialize($robot_table),
+                LOCK_EX);
+        }
+        if (in_array($activity, $this->activities, true)) {
+            $this->call($activity);
+        }
+    }
+    /**
+     * Checks if there is a schedule of sites to crawl available and
+     * if so present it to the requesting fetcher, and then delete it.
+     *
+     * If no schedule file exists for the requested crawl time, or it could
+     * not be read (for instance because another request consumed it in the
+     * meantime), a NO_DATA_STATE message is sent instead.
+     */
+    public function schedule()
+    {
+        $view = "fetch";
+        // set up query
+        $data = [];
+        $crawl_time = 0;
+        if (isset($_REQUEST['crawl_time'])) {
+            $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
+                0, C\TIMESTAMP_LEN);
+        }
+        $schedule_filename = C\CRAWL_DIR."/schedules/".
+            self::schedule_name."$crawl_time.txt";
+        $schedule = false;
+        if (file_exists($schedule_filename)) {
+            $schedule = file_get_contents($schedule_filename);
+            if ($schedule !== false) {
+                // only remove the schedule once it has really been read
+                unlink($schedule_filename);
+            }
+        }
+        if ($schedule !== false) {
+            $data['MESSAGE'] = $schedule;
+        } else {
+            /*  check if scheduler part of queue server went down
+                and needs to be restarted with current crawl time.
+                Idea is fetcher has recently spoken with name server
+                so knows the crawl time. queue server knows time
+                only by file messages never by making curl requests
+             */
+            $this->checkRestart(self::WEB_CRAWL);
+            $info = [];
+            $info[self::STATUS] = self::NO_DATA_STATE;
+            $data['MESSAGE'] = base64_encode(serialize($info))."\n";
+        }
+        $this->displayView($view, $data);
+    }
+    /**
+     * Checks to see whether there are more pages to extract from the current
+     * archive, and if so returns the next batch to the requesting fetcher. The
+     * iteration progress is automatically saved on each call to nextPages, so
+     * that the next fetcher will get the next batch of pages. If there is no
+     * current archive to iterate over, or the iterator has reached the end of
+     * the archive then indicate that there is no more data by setting the
+     * status to NO_DATA_STATE.
+     *
+     * A lock file holding the start time of the request that wrote it is
+     * used so that only one request at a time advances the iterator; a lock
+     * at least max_execution_time seconds old is ignored.
+     */
+    public function archiveSchedule()
+    {
+        $view = "fetch";
+        $data = [];
+        $request_start = time();
+        $crawl_time = 0;
+        if (isset($_REQUEST['crawl_time'])) {
+            $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
+                0, C\TIMESTAMP_LEN);
+        }
+        $messages_filename = C\CRAWL_DIR.'/schedules/NameServerMessages.txt';
+        $lock_filename = C\WORK_DIRECTORY."/schedules/NameServerLock.txt";
+        $fetch_pages = false;
+        $info = [];
+        if ($crawl_time > 0 && file_exists($messages_filename)) {
+            $info = unserialize(file_get_contents($messages_filename));
+            if (!is_array($info)) {
+                /* unreadable or corrupt message file: treat it as carrying
+                   no crawl information rather than indexing into false */
+                $info = [];
+            } else if (isset($info[self::STATUS]) &&
+                $info[self::STATUS] == 'STOP_CRAWL') {
+                /* The stop crawl message gets created by the admin_controller
+                   when the "stop crawl" button is pressed.*/
+                if (file_exists($messages_filename)) {
+                    unlink($messages_filename);
+                }
+                if (file_exists($lock_filename)) {
+                    unlink($lock_filename);
+                }
+                $info = [];
+            } else {
+                $fetch_pages = true;
+            }
+            $this->checkRestart(self::ARCHIVE_CRAWL);
+        }
+        $pages = [];
+        $got_lock = true;
+        if (file_exists($lock_filename)) {
+            $lock_time = unserialize(file_get_contents($lock_filename));
+            // a recent lock means another request is still iterating
+            if ($request_start - $lock_time < ini_get('max_execution_time')) {
+                $got_lock = false;
+            }
+        }
+        $archive_iterator = null;
+        if ($fetch_pages && $got_lock) {
+            file_put_contents($lock_filename, serialize($request_start));
+            if (isset($info[self::ARC_DIR]) &&
+                ($info[self::ARC_DIR] == "MIX" ||
+                file_exists($info[self::ARC_DIR]))) {
+                $iterate_timestamp = $info[self::CRAWL_INDEX];
+                $result_timestamp = $crawl_time;
+                $result_dir = C\WORK_DIRECTORY.
+                    "/schedules/".self::name_archive_iterator.$crawl_time;
+                $arctype = $info[self::ARC_TYPE];
+                $iterator_name = C\NS_ARCHIVE . $arctype."Iterator";
+                try {
+                    if ($info[self::ARC_DIR] == "MIX") {
+                        //recrawl of crawl mix case
+                        $archive_iterator = new $iterator_name(
+                            $iterate_timestamp, $result_timestamp);
+                    } else {
+                        //any other archive crawl except web archive recrawls
+                        $archive_iterator = new $iterator_name(
+                            $iterate_timestamp, $info[self::ARC_DIR],
+                            $result_timestamp, $result_dir);
+                    }
+                } catch (\Exception $e) {
+                    $info['ARCHIVE_BUNDLE_ERROR'] =
+                        "Invalid bundle iterator: '{$iterator_name}' \n".
+                        $e->getMessage();
+                }
+            }
+            $pages = false;
+            if ($archive_iterator && !$archive_iterator->end_of_iterator) {
+                if (L\generalIsA($archive_iterator,
+                    C\NS_ARCHIVE . "TextArchiveBundleIterator")) {
+                    $pages = $archive_iterator->nextChunk();
+                } else {
+                    $pages = $archive_iterator->nextPages(
+                        C\ARCHIVE_BATCH_SIZE);
+                }
+            }
+            if (file_exists($lock_filename)) {
+                unlink($lock_filename);
+            }
+        }
+        if ($archive_iterator && $archive_iterator->end_of_iterator) {
+            $info[self::END_ITERATOR] = true;
+        }
+        if (empty($pages)) {
+            // nothing was fetched: report no data with an empty page list
+            $info[self::STATUS] = self::NO_DATA_STATE;
+            $info[self::POST_MAX_SIZE] = L\metricToInt(
+                ini_get("post_max_size"));
+            $pages = [];
+        }
+        $info[self::DATA] = L\webencode(gzcompress(serialize($pages)));
+        $data['MESSAGE'] = serialize($info);
+        $this->displayView($view, $data);
+    }
+    /**
+     * Checks if the queue server crawl needs to be restarted, and if so
+     * sends a RESUME_CRAWL start message to the crawl model.
+     *
+     * A restart is requested only when all of the following hold: the request
+     * supplies a positive crawl_time; the index closed file for that crawl
+     * exists and was last accessed before the request's check_crawl_time;
+     * there is no pending QueueServerMessages.txt; crawl_status.txt does not
+     * report a non-zero CRAWL_TIME; and the index folder for the crawl
+     * exists in the cache.
+     *
+     * @param string $crawl_type if it does use restart the crawl as a crawl
+     *     of this type. For example, self::WEB_CRAWL or self::ARCHIVE_CRAWL
+     */
+    public function checkRestart($crawl_type)
+    {
+        $crawl_time = 0;
+        /* default so the comparison below is well defined when the request
+           sends crawl_time but no check_crawl_time */
+        $check_crawl_time = 0;
+        if (isset($_REQUEST['crawl_time'])) {
+            $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
+                0, C\TIMESTAMP_LEN);
+            if (isset($_REQUEST['check_crawl_time'])) {
+                $check_crawl_time = substr($this->clean(
+                    $_REQUEST['check_crawl_time'], 'int'), 0, C\TIMESTAMP_LEN);
+            }
+        }
+        $schedules_dir = C\CRAWL_DIR."/schedules/";
+        $index_schedule_file = $schedules_dir .
+            self::index_closed_name . $crawl_time. ".txt";
+        if (!($crawl_time > 0) || !file_exists($index_schedule_file) ||
+            !($check_crawl_time > intval(fileatime($index_schedule_file))) ||
+            file_exists($schedules_dir . "QueueServerMessages.txt")) {
+            return;
+        }
+        $status_file = $schedules_dir . "crawl_status.txt";
+        if (file_exists($status_file)) {
+            $crawl_status = unserialize(file_get_contents($status_file));
+            if (isset($crawl_status['CRAWL_TIME']) &&
+                $crawl_status['CRAWL_TIME'] != 0) {
+                // a crawl is already recorded as active, nothing to resume
+                return;
+            }
+        }
+        if (!file_exists(C\CRAWL_DIR.'/cache/'.
+            self::index_data_base_name.$crawl_time)) {
+            return;
+        }
+        $crawl_params = [];
+        $crawl_params[self::STATUS] = "RESUME_CRAWL";
+        $crawl_params[self::CRAWL_TIME] = $crawl_time;
+        $crawl_params[self::CRAWL_TYPE] = $crawl_type;
+        /*
+            we only set crawl time. Other data such as allowed sites
+            should come from index.
+        */
+        $this->model("crawl")->sendStartCrawlMessage($crawl_params,
+            null, null);
+    }
+    /**
+     * Processes Robot, To Crawl, and Index data sent from a fetcher
+     * Acknowledge to the fetcher if this data was received okay.
+     *
+     * Data may arrive in several parts; each part is checked against the
+     * hash sent with it and, for multi-part uploads, appended to a temp
+     * file named after hash_data until the last part arrives. The reply
+     * carries CONTINUE_STATE when the part was accepted and REDO_STATE
+     * (with a summary of what was wrong) otherwise, together with memory
+     * usage, post_max_size and the current crawl time. Peak memory figures
+     * are also recorded in crawl_status.txt when that file exists.
+     */
+    public function update()
+    {
+        $view = "fetch";
+        $info_flag = false;
+        $logging = "";
+        $necessary_fields = ['byte_counts', 'current_part', 'hash_data',
+            'hash_part', 'num_parts', 'part'];
+        $part_flag = true;
+        $missing = "";
+        foreach ($necessary_fields as $field) {
+            if (!isset($_REQUEST[$field])) {
+                $part_flag = false;
+                $missing = $field;
+            }
+        }
+        if (isset($_REQUEST['crawl_type'])) {
+            $this->checkRestart($this->clean(
+                $_REQUEST['crawl_type'], 'string'));
+        }
+        // hash the received part once; it is needed for check and summary
+        $part_hash = ($part_flag) ? L\crawlHash($_REQUEST['part']) : "";
+        if ($part_flag && $part_hash == $_REQUEST['hash_part']) {
+            $upload = false;
+            $filename = "";
+            if (intval($_REQUEST['num_parts']) > 1) {
+                $info_flag = true;
+                if (!file_exists(C\CRAWL_DIR."/temp")) {
+                    mkdir(C\CRAWL_DIR."/temp");
+                    L\setWorldPermissionsRecursive(C\CRAWL_DIR."/temp/");
+                }
+                /* hash_data comes from the request, so strip any directory
+                   components to keep the temp file inside the temp folder */
+                $filename = C\CRAWL_DIR."/temp/".
+                    basename($_REQUEST['hash_data']);
+                file_put_contents($filename, $_REQUEST['part'], FILE_APPEND);
+                L\setWorldPermissions($filename);
+                if ($_REQUEST['num_parts'] == $_REQUEST['current_part']) {
+                    $upload = true;
+                }
+            } else if (intval($_REQUEST['num_parts']) == 1) {
+                $info_flag = true;
+                $upload = true;
+            }
+            if ($upload) {
+                $logging = $this->handleUploadedData($filename);
+            } else {
+                $logging = "...".(
+                    $_REQUEST['current_part']/$_REQUEST['num_parts']).
+                    " of data uploaded.";
+            }
+        }
+        $info = [];
+        if ($logging != "") {
+            $info[self::LOGGING] = $logging;
+        }
+        if ($info_flag == true) {
+            $info[self::STATUS] = self::CONTINUE_STATE;
+        } else {
+            $info[self::STATUS] = self::REDO_STATE;
+            if (!$part_flag) {
+                $info[self::SUMMARY] = "Missing request field: $missing.";
+            } else {
+                $info[self::SUMMARY] = "Hash of uploaded data was:".
+                    $part_hash.". Sent checksum was:".
+                    $_REQUEST['hash_part'];
+            }
+        }
+        $info[self::MEMORY_USAGE] = memory_get_peak_usage();
+        $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
+        $status_filename = C\CRAWL_DIR . "/schedules/crawl_status.txt";
+        if (file_exists($status_filename)) {
+            $change = false;
+            $crawl_status = unserialize(file_get_contents($status_filename));
+            if (isset($_REQUEST['fetcher_peak_memory'])) {
+                /* test the key that is actually stored, so that the file
+                   keeps the maximum seen rather than the latest value */
+                if (!isset($crawl_status['FETCHER_PEAK_MEMORY']) ||
+                    $_REQUEST['fetcher_peak_memory'] >
+                    $crawl_status['FETCHER_PEAK_MEMORY']
+                ) {
+                    $crawl_status['FETCHER_PEAK_MEMORY'] =
+                        $_REQUEST['fetcher_peak_memory'];
+                    $change = true;
+                }
+            }
+            if (!isset($crawl_status['WEBAPP_PEAK_MEMORY']) ||
+                $info[self::MEMORY_USAGE] >
+                $crawl_status['WEBAPP_PEAK_MEMORY']) {
+                $crawl_status['WEBAPP_PEAK_MEMORY'] =
+                    $info[self::MEMORY_USAGE];
+                $change = true;
+            }
+            /* NOTE(review): this tests the key self::CRAWL_TIME while the
+               else branch reads the literal key 'CRAWL_TIME' -- confirm
+               which key crawl_status.txt is written with.
+             */
+            if (!isset($crawl_status[self::CRAWL_TIME])) {
+                $network_filename = C\CRAWL_DIR."/schedules/network_status.txt";
+                if (file_exists($network_filename)) {
+                    $info[self::CRAWL_TIME] = unserialize(file_get_contents(
+                        $network_filename));
+                    $change = true;
+                } else {
+                    $info[self::CRAWL_TIME] = 0;
+                }
+            } else {
+                $info[self::CRAWL_TIME] = $crawl_status['CRAWL_TIME'];
+            }
+            if ($change == true) {
+                file_put_contents($status_filename,
+                    serialize($crawl_status), LOCK_EX);
+            }
+        } else {
+            $info[self::CRAWL_TIME] = 0;
+        }
+        // refresh so the reply reflects memory used by the work above
+        $info[self::MEMORY_USAGE] = memory_get_peak_usage();
+        $data = [];
+        $data['MESSAGE'] = serialize($info);
+        $this->displayView($view, $data);
+    }
+    /**
+     * After robot, schedule, and index data have been uploaded and reassembled
+     * as one big data file/string, this function splits that string into
+     * each of these data types and then save the result into the appropriate
+     * schedule sub-folder. Any temporary files used during uploading are then
+     * deleted.
+     *
+     * The split points come from the byte_counts request field: the string
+     * is read as ROBOT bytes, then CACHE_PAGE_VALIDATION bytes, then SCHEDULE
+     * bytes, and whatever remains is index data. Nothing is split unless
+     * byte_counts has a positive TOTAL. Cache page validation data is only
+     * saved when C\USE_ETAG_EXPIRES is set.
+     *
+     * @param string $filename name of temp file used to upload big string.
+     *     If uploaded data was small enough to be uploaded in one go, then
+     *     this should be "" -- the variable $_REQUEST["part"] will be used
+     *     instead
+     * @return string $logging diagnostic info to be sent to fetcher about
+     *     what was done
+     */
+    public function handleUploadedData($filename = "")
+    {
+        if ($filename == "") {
+            $uploaded = $_REQUEST['part'];
+        } else {
+            $uploaded = file_get_contents($filename);
+            unlink($filename);
+            if ($uploaded === false) {
+                // temp file could not be read: there is nothing to split
+                $uploaded = "";
+            }
+        }
+        $logging = "... Data upload complete\n";
+        $byte_counts = [];
+        if (isset($_REQUEST['byte_counts'])) {
+            /* NOTE(review): unserialize() is applied to request data here;
+               this relies on the request having been authenticated before
+               this point -- confirm against the controller's checks.
+             */
+            $byte_counts = unserialize(L\webdecode($_REQUEST['byte_counts']));
+        }
+        $robot_data = "";
+        $cache_page_validation_data = "";
+        $schedule_data = "";
+        $index_data = "";
+        if (is_array($byte_counts) && isset($byte_counts["TOTAL"]) &&
+            $byte_counts["TOTAL"] > 0) {
+            // a missing or negative count is treated as an empty section
+            $lengths = [];
+            foreach (["ROBOT", "CACHE_PAGE_VALIDATION", "SCHEDULE"] as $key) {
+                $lengths[$key] = isset($byte_counts[$key]) ?
+                    max(0, intval($byte_counts[$key])) : 0;
+            }
+            $pos = 0;
+            $robot_data = substr($uploaded, $pos, $lengths["ROBOT"]);
+            $pos += $lengths["ROBOT"];
+            $cache_page_validation_data = substr($uploaded, $pos,
+                $lengths["CACHE_PAGE_VALIDATION"]);
+            $pos += $lengths["CACHE_PAGE_VALIDATION"];
+            $schedule_data = substr($uploaded, $pos, $lengths["SCHEDULE"]);
+            $pos += $lengths["SCHEDULE"];
+            $index_data = substr($uploaded, $pos);
+        }
+        if (strlen($robot_data) > 0) {
+            $this->addScheduleToScheduleDirectory(self::robot_data_base_name,
+                $robot_data);
+        }
+        if (C\USE_ETAG_EXPIRES && strlen($cache_page_validation_data) > 0) {
+            $this->addScheduleToScheduleDirectory(
+                self::etag_expires_data_base_name,
+                $cache_page_validation_data);
+        }
+        if (strlen($schedule_data) > 0) {
+            $this->addScheduleToScheduleDirectory(self::schedule_data_base_name,
+                $schedule_data);
+        }
+        if (strlen($index_data) > 0) {
+            $this->addScheduleToScheduleDirectory(self::index_data_base_name,
+                $index_data);
+        }
+        return $logging;
+    }
+    /**
+     * Saves $data_string as a new file in the schedule folder for the given
+     * kind of schedule and the crawl time of the current request.
+     *
+     * The file is placed in a per-day subfolder (day number computed from
+     * the current time) and its name is made from the current time, the
+     * address of the requesting machine, and a hash of the data. Both
+     * folders are created, with permissions 0777, if they do not yet exist.
+     *
+     * @param string $schedule_name the name of the kind of schedule being saved
+     * @param string& $data_string encoded, compressed, serialized data the
+     *     schedule is to contain
+     */
+    public function addScheduleToScheduleDirectory($schedule_name,
+        &$data_string)
+    {
+        $timestamp = substr($this->clean($_REQUEST['crawl_time'], "int"), 0,
+            C\TIMESTAMP_LEN);
+        $schedule_dir = C\CRAWL_DIR . "/schedules/" . $schedule_name .
+            $timestamp;
+        // dots and colons of the (IPv4 or IPv6) address are made file safe
+        $sender = str_replace([".", ":"], ["-", "_"],
+            $_SERVER['REMOTE_ADDR']);
+        $now = time();
+        $day_number = floor($now / C\ONE_DAY);
+        $day_dir = $schedule_dir . "/" . $day_number;
+        foreach ([$schedule_dir, $day_dir] as $folder) {
+            if (!file_exists($folder)) {
+                mkdir($folder);
+                chmod($folder, 0777);
+            }
+        }
+        $file_name = $day_dir . "/At" . $now . "From" . $sender .
+            "WithHash" . L\crawlHash($data_string) . ".txt";
+        file_put_contents($file_name, $data_string);
+    }
+    /**
+     * Checks for the crawl time according either to crawl_status.txt or to
+     * network_status.txt, and presents it to the requesting fetcher, along
+     * with a list of available queue servers.
+     *
+     * When a crawl is active and NameServerMessages.txt exists, the reply
+     * also carries either a stop status (with crawl time reset to 0) or a
+     * minimum fetch loop time; if the crawl time differs from the one the
+     * fetcher sent, the crawl's settings and any active classifier data
+     * are included too. Fetcher cron tasks are run if more than
+     * CRON_INTERVAL seconds have passed since they were last run.
+     */
+    public function crawlTime()
+    {
+        $info = [];
+        $info[self::STATUS] = self::CONTINUE_STATE;
+        $view = "fetch";
+        $cron_model = $this->model("cron");
+        $prev_crawl_time = 0;
+        if (isset($_REQUEST['crawl_time'])) {
+            $prev_crawl_time = substr(
+                $this->clean($_REQUEST['crawl_time'], 'int'), 0,
+                C\TIMESTAMP_LEN);
+        }
+        $cron_time = $cron_model->getCronTime("fetcher_restart");
+        $delta = time() - $cron_time;
+        if ($delta > self::CRON_INTERVAL) {
+            $cron_model->updateCronTime("fetcher_restart");
+            $this->doFetcherCronTasks();
+        } else if ($delta == 0) {
+            $cron_model->updateCronTime("fetcher_restart");
+        }
+        $local_filename = C\CRAWL_DIR."/schedules/crawl_status.txt";
+        $network_filename = C\CRAWL_DIR."/schedules/network_status.txt";
+        if (file_exists($local_filename)) {
+            $crawl_status = unserialize(file_get_contents($local_filename));
+            $crawl_time = (isset($crawl_status["CRAWL_TIME"])) ?
+                $crawl_status["CRAWL_TIME"] : 0;
+        } else if (file_exists($network_filename)) {
+            $crawl_time = unserialize(file_get_contents($network_filename));
+        } else {
+            $crawl_time = 0;
+        }
+        $info[self::CRAWL_TIME] = $crawl_time;
+        $status_filename = C\CRAWL_DIR."/schedules/NameServerMessages.txt";
+        if ($crawl_time != 0 && file_exists($status_filename)) {
+            $status = unserialize(file_get_contents($status_filename));
+            $stopping = isset($status[self::STATUS]) &&
+                $status[self::STATUS] == 'STOP_CRAWL';
+            if ($stopping) {
+                /* assignment was intended here; the earlier code compared
+                   with == and so never reported the stop status */
+                $info[self::STATUS] = 'STOP_CRAWL';
+                $info[self::CRAWL_TIME] = 0;
+            } else {
+                $tmp_base_dir = C\CRAWL_DIR."/schedules/".
+                    self::index_data_base_name . $crawl_time;
+                $tmp_dirs = glob($tmp_base_dir.'/*', GLOB_ONLYDIR);
+                // glob may yield false rather than an empty array
+                $mult_factor = max(1, is_array($tmp_dirs) ?
+                    count($tmp_dirs) : 0);
+                $info[self::MINIMUM_FETCH_LOOP_TIME] = max(min(
+                    $mult_factor * C\MINIMUM_FETCH_LOOP_TIME,
+                    C\PROCESS_TIMEOUT/2), C\MINIMUM_FETCH_LOOP_TIME);
+            }
+            if (!$stopping && $crawl_time != $prev_crawl_time) {
+                $to_copy_fields = [self::ALLOWED_SITES, self::ARC_DIR,
+                    self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
+                    self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
+                    self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
+                    self::SUMMARIZER_OPTION, self::TOR_PROXY
+                    ];
+                foreach ($to_copy_fields as $field) {
+                    if (isset($status[$field])) {
+                        $info[$field] = $status[$field];
+                    }
+                }
+                /*
+                   When initiating a new crawl AND there are active
+                   classifiers (an array of class labels), then augment the
+                   info with compressed, serialized versions of each active
+                   classifier so that each fetcher can reconstruct the same
+                   classifiers.
+                 */
+                $classifier_array = [];
+                if (isset($status[self::ACTIVE_CLASSIFIERS])) {
+                    $classifier_array = array_merge(
+                        $status[self::ACTIVE_CLASSIFIERS]);
+                    $info[self::ACTIVE_CLASSIFIERS] =
+                        $status[self::ACTIVE_CLASSIFIERS];
+                }
+                if (isset($status[self::ACTIVE_RANKERS])) {
+                    $classifier_array = array_merge($classifier_array,
+                        $status[self::ACTIVE_RANKERS]);
+                    $info[self::ACTIVE_RANKERS] =
+                        $status[self::ACTIVE_RANKERS];
+                }
+                if ($classifier_array != []) {
+                    $classifiers_data = Classifier::loadClassifiersData(
+                            $classifier_array);
+                    $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
+                }
+            }
+        }
+        $info[self::SCRAPERS] = base64_encode(
+            serialize($this->model("scraper")->getAllScrapers()));
+        $info[self::QUEUE_SERVERS] =
+            $this->model("machine")->getQueueServerUrls();
+        $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
+        $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
+        if (count($info[self::QUEUE_SERVERS]) == 0) {
+            $info[self::QUEUE_SERVERS] = [C\NAME_SERVER];
+        }
+        $data = [];
+        $data['MESSAGE'] = serialize($info);
+        $this->displayView($view, $data);
+    }
+    /**
+     * Performs the periodic maintenance tasks of the Name Server.
+     * At present the only task is asking the machine model to restart
+     * any fetchers the user turned on which have crashed
+     */
+    public function doFetcherCronTasks()
+    {
+        $machine_model = $this->model("machine");
+        $machine_model->restartCrashedFetchers();
+    }
+    /**
+     * Gets a list of all the timestamps of previously stored crawls
+     *
+     * This could probably be moved to crawl model. It is a little lighter
+     * than getCrawlList and should be only used with a name server so leaving
+     * it here so it won't be confused.
+     *
+     * Timestamps are read off the names of the entries in the cache folder:
+     * whatever follows self::index_data_base_name, or the numeric text
+     * between self::network_base_name and the last four characters of the
+     * name. Duplicates are removed; keys of the result are not renumbered.
+     *
+     * @return array list of timestamps
+     */
+    public function getCrawlTimes()
+    {
+        $list = [];
+        $dirs = glob(C\CRAWL_DIR.'/cache/*');
+        if (empty($dirs)) {
+            // glob gives false on error (and, on some systems, on no match)
+            return $list;
+        }
+        $index_prefix_len = strlen(self::index_data_base_name);
+        $network_prefix_len = strlen(self::network_base_name);
+        foreach ($dirs as $dir) {
+            $pre_timestamp = strstr($dir, self::index_data_base_name);
+            if ($pre_timestamp !== false) {
+                $list[] = substr($pre_timestamp, $index_prefix_len);
+            }
+            $pre_timestamp = strstr($dir, self::network_base_name);
+            if ($pre_timestamp !== false) {
+                // drop the prefix and the trailing four characters
+                $tmp = substr($pre_timestamp, $network_prefix_len, -4);
+                if (is_numeric($tmp)) {
+                    $list[] = $tmp;
+                }
+            }
+        }
+        $list = array_unique($list);
+        return $list;
+    }
+}
--- a/src/controllers/GroupController.php
+++ b/src/controllers/GroupController.php
@ -0,0 +1,237 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop as B;
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\WikiParser;
+
+/**
+ * Controller used to handle user group activities outside of
+ * the admin panel setting. This either could be because the admin panel
+ * is "collapsed" or because the request concerns a wiki page.
+ *
+ * @author Chris Pollett
+ */
+class GroupController extends Controller implements CrawlConstants
+{
+    /**
+     * Says which activities (roughly methods invoked from the web) this
+     * controller will respond to (note: more activities will be loaded from
+     * components)
+     * @var array
+     */
+    public $activities = ["groupFeeds", "wiki"];
+    /**
+     * Associative array of $components activities for this controller
+     * Components are collections of activities (a little like traits) which
+     * can be reused.
+     *
+     * @var array
+     */
+    public static $component_activities = ["social" => ["groupFeeds", "wiki"]];
+    /**
+     * Used to process requests related to user group activities outside of
+     * the admin panel setting. This either could be because the admin panel
+     * is "collapsed" or because the request concerns a wiki page.
+     *
+     * When the CSRF token of the request does not check out, $_REQUEST is
+     * cut down to a fixed list of cleaned fields before the activity is run.
+     * The data the activity returns is then drawn using the group view, the
+     * view the activity asked for (REFRESH, or VIEW for the wiki activity),
+     * or, for non-wiki activities, the rss, json, or serial format
+     * requested through $_REQUEST['f'].
+     */
+    public function processRequest()
+    {
+        $data = [];
+        if (!C\PROFILE) {
+            return $this->configureRequest();
+        }
+        if (isset($_SESSION['USER_ID'])) {
+            $user_id = $_SESSION['USER_ID'];
+            $data['ADMIN'] = 1;
+        } else {
+            /* no session user, so the remote address stands in as the id
+               used for the user name lookup and the CSRF token
+             */
+            $user_id = $_SERVER['REMOTE_ADDR'];
+        }
+        $signin_model = $this->model("signin");
+        $data['USERNAME'] = $signin_model->getUserName($user_id);
+        $data['SCRIPT'] = "";
+        $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
+        $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user_id);
+        if (!$token_okay) {
+            /* without a valid token only the fields below survive (after
+               cleaning), and arg is limited to the four values listed
+             */
+            $keep_fields = ["a", "arg", "f", "callback", "group_id",
+                "just_group_id", "just_user_id", "just_thread", "limit", "n",
+                "num", "page_id", "page_name", 'v', "group_name", 'sf'];
+            $request = $_REQUEST;
+            $_REQUEST = [];
+            foreach ($keep_fields as $field) {
+                if (isset($request[$field])) {
+                    if ($field == "arg" && (!in_array($request[$field],
+                        ["read", "pages", "media", "statistics"]) )){
+                        continue;
+                    }
+                    $_REQUEST[$field] =
+                        $this->clean($request[$field], "string");
+                }
+            }
+            $_REQUEST["c"] = "group";
+        }
+        $data = array_merge($data, $this->processSession());
+        if (!isset($data['REFRESH'])) {
+            $view = "group";
+        } else {
+            $view = $data['REFRESH'];
+        }
+        if ($data['ACTIVITY_METHOD'] == "wiki") {
+            if (isset($data["VIEW"]) && !isset($data['REFRESH'])) {
+                $view = $data["VIEW"];
+            }
+        } else if (isset($_REQUEST['f']) &&
+            in_array($_REQUEST['f'], ["rss", "json", "serial"])) {
+            // may change $view (rss) or output the data itself and exit
+            $this->setupViewFormatOutput($_REQUEST['f'], $view, $data);
+        }
+        $_SESSION['REMOTE_ADDR'] = $_SERVER['REMOTE_ADDR'];
+        $this->initializeAdFields($data);
+        $this->displayView($view, $data);
+    }
+    /**
+     * Used to perform the actual activity call to be done by the
+     * group_controller.
+     * processSession is called from @see processRequest, which does some
+     * cleaning of fields if the CSRFToken is not valid. processSession makes
+     * sure the $_REQUEST'd activity is valid (or falls back to groupFeeds)
+     * then calls it. The name of this controller and of the activity that
+     * was run are added to the activity's data as ACTIVITY_CONTROLLER and
+     * ACTIVITY_METHOD (the latter is for the settings controller).
+     *
+     * @return array data produced by the activity together with the
+     *      ACTIVITY_CONTROLLER and ACTIVITY_METHOD fields
+     */
+    public function processSession()
+    {
+        if (isset($_REQUEST['a']) &&
+            in_array($_REQUEST['a'], $this->activities)) {
+            $activity = $_REQUEST['a'];
+        } else {
+            $activity = "groupFeeds";
+        }
+        $_SESSION['HIDE_ACTIVITIES'] =  false;
+        if (!empty($_SESSION['USER_ID'])) {
+            $this->model("user")->setUserSession($_SESSION['USER_ID'],
+                $_SESSION);
+        }
+        $data = $this->call($activity);
+        /* the array check has to come before any field is written, otherwise
+           a non-array result either makes the writes below fail or makes
+           the check pointless
+         */
+        if (!is_array($data)) {
+            $data = [];
+        }
+        $data['ACTIVITY_CONTROLLER'] = "group";
+        $data['ACTIVITY_METHOD'] = $activity; //for settings controller
+        return $data;
+    }
+    /**
+     * Responsible for setting the view for a feed if something other
+     * than HTML (for example, RSS or JSON) is desired. It also
+     * sets up any particular $data fields needed for displaying that
+     * view correctly. For the json and serial formats the feed is output
+     * directly and the script exits.
+     *
+     * @param string $format can be one of rss, json, or serial,
+     *      if different, the view is left as it was passed in
+     * @param string& $view variable used to set the view in calling
+     *     method
+     * @param array& $data used to send data to the view for drawing
+     */
+    public function setupViewFormatOutput($format, &$view, &$data)
+    {
+        // later just_* fields take precedence over earlier ones
+        $data["QUERY"] = "groups:feed";
+        if (isset($data["JUST_GROUP_ID"])) {
+            $data["QUERY"] = "groups:just_group_id:".$data["JUST_GROUP_ID"];
+        }
+        if (isset($data["JUST_USER_ID"])) {
+            $data["QUERY"] = "groups:just_user_id:".$data["JUST_USER_ID"];
+        }
+        if (isset($data["JUST_THREAD"])) {
+            $data["QUERY"] = "groups:just_thread:".$data["JUST_THREAD"];
+        }
+        $data["its"] = 0;
+        $num_pages = count($data["PAGES"]);
+        /* NOTE(review): processRequest() records a signed in user under
+           the key 'ADMIN', whereas 'admin' is tested here -- confirm which
+           key is intended
+         */
+        $token = empty($data['admin']) ? "" :
+            C\CSRF_TOKEN . "=".  $data[C\CSRF_TOKEN];
+        for ($i = 0; $i < $num_pages; $i++) {
+            $data["PAGES"][$i][self::URL] = htmlentities(B\feedsUrl(
+                "thread", $data["PAGES"][$i]['PARENT_ID'],
+                !empty($data['admin']), $data['CONTROLLER'])) . $token;
+        }
+        switch ($format) {
+            case "rss":
+                $view = "rss";
+                break;
+            case "json":
+            case "serial":
+                /* both formats output the same structure; it has to be
+                   built for serial as well, otherwise there is nothing
+                   to serialize
+                 */
+                $out_data = [];
+                $out_data["language"] = L\getLocaleTag();
+                $out_data["link"] =
+                    C\NAME_SERVER."?f=$format&amp;q={$data['QUERY']}";
+                $out_data["totalResults"] = $data['TOTAL_ROWS'];
+                $out_data["startIndex"] = $data['LIMIT'];
+                $out_data["itemsPerPage"] = $data['RESULTS_PER_PAGE'];
+                foreach ($data['PAGES'] as $page) {
+                    $item = [];
+                    $item["title"] = $page[self::TITLE];
+                    if (!isset($page[self::TYPE]) ||
+                    (isset($page[self::TYPE])
+                    && $page[self::TYPE] != "link")) {
+                        $item["link"] = $page[self::URL];
+                    } else {
+                        // for items of type link the title is used as link
+                        $item["link"] = strip_tags($page[self::TITLE]);
+                    }
+                    $item["description"] = strip_tags($page[self::DESCRIPTION]);
+                    if (isset($page[self::THUMB])
+                    && $page[self::THUMB] != 'null') {
+                        $item["thumb"] = $page[self::THUMB];
+                    }
+                    if (isset($page[self::TYPE])) {
+                        $item["type"] = $page[self::TYPE];
+                    }
+                    $out_data['item'][] =$item;
+                }
+                if ($format == "serial") {
+                    e(serialize($out_data));
+                    exit();
+                }
+                $out = json_encode($out_data);
+                //jsonp format
+                if (isset($_REQUEST['callback'])) {
+                    /* NOTE(review): the callback name is only cleaned as a
+                       string before being written into a javascript
+                       response; consider restricting it to identifier
+                       characters
+                     */
+                    $callback = $this->clean($_REQUEST['callback'], "string");
+                    $out = "// API callback\n$callback($out);";
+                    header("Content-Type: text/javascript; charset=UTF-8");
+                } else {
+                    header("Content-Type: application/json");
+                }
+                e($out);
+                exit();
+        }
+    }
+}
--- a/src/controllers/JobsController.php
+++ b/src/controllers/JobsController.php
@ -0,0 +1,114 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\MediaConstants;
+use seekquarry\yioop\library\UrlParser;
+
+/**
+ * This class is used to handle requests from a MediaUpdater to a name server
+ * There are three main types of requests: getUpdateProperties, and
+ * for any job that the MediaUpdater might be running, its getTasks, and
+ * putTasks request. getUpdateProperties is supposed to provide configuration
+ * settings for the MediaUpdater. A MediaUpdater might be running several
+ * periodic jobs. The getTasks requests of a job is used to see if there
+ * is any new work available of that job type on the name server. A
+ * putTasks request is used to handle any computed data sent back from a
+ * MediaUpdater to the name server.
+ *
+ * @author Chris Pollett
+ */
+class JobsController extends Controller implements CrawlConstants,
+    MediaConstants
+{
+    /**
+     * Names of the activities this controller runs directly through call().
+     * getTasks and putTasks requests are instead handed to a job object in
+     * processRequest.
+     * @var array
+     */
+    public $activities = ["getUpdateProperties"];
+    /**
+     * Checks that the request seems to be coming from a legitimate
+     * MediaUpdater, then works out what is being asked for. An activity
+     * listed in $activities is called on this controller. A getTasks or
+     * putTasks request which names a job and a machine_id is passed on to
+     * an object of that job's class, and the serialized, web encoded result
+     * is output. Anything else is ignored.
+     */
+    public function processRequest()
+    {
+        // requests that fail the legitimacy check are dropped outright
+        if (!$this->checkRequest()) {
+            return;
+        }
+        $activity = false;
+        if (isset($_REQUEST['a'])) {
+            $activity = $_REQUEST['a'];
+        }
+        if (in_array($activity, $this->activities)) {
+            $this->call($activity);
+            return;
+        }
+        $is_job_request = !empty($_REQUEST['job']) &&
+            !empty($_REQUEST['machine_id']) &&
+            in_array($activity, ["getTasks", "putTasks"]);
+        if (!$is_job_request) {
+            return;
+        }
+        $job = $this->clean($_REQUEST['job'], "string");
+        $machine_id = L\webdecode(
+            $this->clean($_REQUEST['machine_id'], "string"));
+        $args = null;
+        if (isset($_REQUEST['args'])) {
+            /* NOTE(review): unserialize() is run on data taken from the
+               request; this is only acceptable if checkRequest() reliably
+               limits callers to trusted machines -- confirm
+             */
+            $args = unserialize(L\webdecode($_REQUEST['args']));
+        }
+        $class_name = C\NS_JOBS . lcfirst($job) . "Job";
+        if (!class_exists($class_name)) {
+            return;
+        }
+        $job_object = new $class_name(null, $this);
+        $result = $job_object->$activity($machine_id, $args);
+        echo L\webencode(serialize($result));
+    }
+    /**
+     * Used to get the update properties of a media updater. Outputs, in
+     * serialized and web encoded form, the MEDIA_MODE and
+     * SEND_MAIL_MEDIA_UPDATER settings of the profile stored in the work
+     * directory. When the profile lacks a setting, "name_server" and false
+     * respectively are sent instead.
+     */
+    public function getUpdateProperties()
+    {
+        $profile = $this->model("profile")->getProfile(C\WORK_DIRECTORY);
+        // start from the fallback values, then overlay the profile's own
+        $response = [
+            'MEDIA_MODE' => "name_server",
+            'SEND_MAIL_MEDIA_UPDATER' => false
+        ];
+        foreach (array_keys($response) as $field) {
+            if (isset($profile[$field])) {
+                $response[$field] = $profile[$field];
+            }
+        }
+        echo L\webencode(serialize($response));
+    }
+}
--- a/src/controllers/MachineController.php
+++ b/src/controllers/MachineController.php
@ -0,0 +1,221 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\CrawlDaemon;
+
+/**
+ * This class handles requests from a computer that is managing several
+ * fetchers and queue_servers. This controller might be used to start, stop
+ * fetchers/queue_server as well as get status on the active fetchers
+ *
+ * @author Chris Pollett
+ */
+class MachineController extends Controller implements CrawlConstants
+{
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    public $activities = ["statuses", "update", "log"];
+    /**
+     * Number of characters from end of most recent log file to return
+     * on a log request
+     */
+    const LOG_LISTING_LEN = 200000;
+    /**
+     * Checks that the request seems to be coming from a legitimate machine
+     * then determines which activity is being requested and calls that
+     * activity for processing. Requests without a known activity are
+     * ignored.
+     */
+    public function processRequest()
+    {
+        /* do a quick test to see if this request seems to come
+           from a legitimate machine
+         */
+        if (!$this->checkRequest()) {
+            return;
+        }
+        // a request might not name an activity at all
+        $activity = (isset($_REQUEST['a'])) ? $_REQUEST['a'] : false;
+        if (in_array($activity, $this->activities)) {
+            $this->call($activity);
+        }
+    }
+    /**
+     * Outputs as JSON the statuses CrawlDaemon reports for the
+     * fetchers and queue_servers of the current Yioop instance. If the
+     * request has an arg field, its cleaned value is first stored in
+     * WORK_DIRECTORY/schedules/current_machine_info.txt
+     */
+    public function statuses()
+    {
+        if (isset($_REQUEST["arg"])) {
+            $hash_url = $this->clean($_REQUEST["arg"], "string");
+            // the next file tells the MediaUpdater what machine it is
+            file_put_contents(C\WORK_DIRECTORY.
+                "/schedules/current_machine_info.txt",
+                $hash_url);
+        }
+        header("Content-Type: application/json");
+        echo json_encode(CrawlDaemon::statuses());
+    }
+    /**
+     * Used to start/stop a QueueServer, Mirror, MediaUpdater, or Fetcher
+     * of the current Yioop instance, or to restart a Fetcher, based on the
+     * type, id, and action fields of the current $_REQUEST. Nothing is done
+     * unless all three fields are present. A start is only carried out when
+     * the process is not listed in the current statuses and a stop only
+     * when it is.
+     */
+    public function update()
+    {
+        if (!isset($_REQUEST['type']) || !isset($_REQUEST['id']) ||
+            !isset($_REQUEST['action'])) {
+            return;
+        }
+        $action = $_REQUEST['action'];
+        $statuses = CrawlDaemon::statuses();
+        /* the status lookups below use index -1 for everything except
+           fetchers -- presumably the slot CrawlDaemon::statuses() uses for
+           processes that are not numbered (confirm)
+         */
+        switch ($_REQUEST['type']) {
+            case 'QueueServer':
+                if ($action == "start" &&
+                    !isset($statuses["QueueServer"][-1])) {
+                    CrawlDaemon::start("QueueServer", 'none',
+                        self::INDEXER, 0);
+                    CrawlDaemon::start("QueueServer", 'none',
+                        self::SCHEDULER, 2);
+                } else if ($action == "stop" &&
+                    isset($statuses["QueueServer"][-1]) ) {
+                    CrawlDaemon::stop("QueueServer");
+                }
+                break;
+            case 'Mirror':
+                if ($action == "start" &&
+                    !isset($statuses["Mirror"][-1])) {
+                    $parent = (isset($_REQUEST['parent'])) ?
+                        $this->clean($_REQUEST['parent'], 'string') : "";
+                    if ($parent) {
+                        file_put_contents(C\CRAWL_DIR .
+                            "/schedules/mirror_parent.txt",
+                            L\webdecode($parent));
+                    }
+                    CrawlDaemon::start("Mirror");
+                } else if ($action == "stop" &&
+                    isset($statuses["Mirror"][-1]) ) {
+                    /* the stop request arrives in the action field like
+                       for the other process types; there is no Mirror
+                       request field
+                     */
+                    CrawlDaemon::stop("Mirror");
+                }
+                break;
+            case 'MediaUpdater':
+                if ($action == "start" &&
+                    !isset($statuses["MediaUpdater"][-1])) {
+                    CrawlDaemon::start("MediaUpdater");
+                } else if ($action == "stop" &&
+                    isset($statuses["MediaUpdater"][-1]) ) {
+                    CrawlDaemon::stop("MediaUpdater");
+                }
+                break;
+            case 'Fetcher':
+                // fetcher ids are cleaned as ints, the same as in log()
+                $id = $this->clean($_REQUEST['id'], "int");
+                if ($action == "start" &&
+                    !isset($statuses["Fetcher"][$id]) ) {
+                    CrawlDaemon::start("Fetcher", "$id");
+                } else if ($action == "stop" &&
+                    isset($statuses["Fetcher"][$id]) ) {
+                    CrawlDaemon::stop("Fetcher", "$id");
+                }
+                break;
+            case 'RestartFetcher':
+                $error_log = C\CRASH_LOG_NAME;
+                /* the id ends up in a log file path, so it is cleaned as
+                   an int, the same as in log()
+                 */
+                $id = $this->clean($_REQUEST['id'], "int");
+                $msg = "Restarting Fetcher $id";
+                $time_string = date("r", time());
+                $out_msg = "[$time_string] $msg\n";
+                // record the end of the fetcher's log with the restart
+                $lines = L\tail(C\LOG_DIR."/$id-Fetcher.log", 10);
+                foreach ($lines as $line) {
+                    $out_msg .= "!!!!$line\n";
+                }
+                // start the crash log afresh if missing or too large
+                if (!file_exists($error_log) || filesize($error_log) >
+                    C\MAX_LOG_FILE_SIZE) {
+                    file_put_contents($error_log, $out_msg);
+                } else {
+                    file_put_contents($error_log, $out_msg,
+                        FILE_APPEND);
+                }
+                CrawlDaemon::start("Fetcher", "$id");
+                break;
+        }
+    }
+    /**
+     * Used to retrieve a fetcher/queue_server logfile for the current
+     * Yioop instance. Up to LOG_LISTING_LEN characters from the end of
+     * the log file selected by the type (and for fetchers id) request fields
+     * are output web encoded as JSON. If the request has an f field, only
+     * lines containing its value (case insensitively) are output.
+     */
+    public function log()
+    {
+        $log_data = "";
+        if (!isset($_REQUEST["type"])) {
+            echo json_encode(urlencode($log_data));
+            return;
+        }
+        // stays empty for types that do not have a log file
+        $log_file_name = "";
+        switch ($_REQUEST["type"]) {
+            case "Fetcher":
+                $fetcher_num = $this->clean($_REQUEST["id"], "int");
+                $log_file_name = C\LOG_DIR . "/{$fetcher_num}-Fetcher.log";
+                break;
+            case "MediaUpdater":
+            case "Mirror":
+            case "QueueServer":
+                $log_file_name = C\LOG_DIR . "/".$_REQUEST["type"].".log";
+                break;
+        }
+        $filter = "";
+        if (isset($_REQUEST["f"])) {
+            $filter = $this->clean($_REQUEST["f"], "string");
+        }
+        if ($log_file_name != "" && file_exists($log_file_name)) {
+            $size = filesize($log_file_name);
+            $len = min(self::LOG_LISTING_LEN, $size);
+            $fh = fopen($log_file_name, "r");
+            if ($fh) {
+                // read only the last $len bytes of the file
+                fseek($fh, $size - $len);
+                $log_data = fread($fh, $len);
+                fclose($fh);
+            }
+            if ($filter != "" && strlen($log_data) > 0) {
+                $log_lines = explode("\n", $log_data);
+                $out_lines = [];
+                foreach ($log_lines as $line) {
+                    if (stristr($line, $filter)) {
+                        $out_lines[] = $line;
+                    }
+                }
+                if (count($out_lines) == 0) {
+                    $out_lines[] = tl('machine_controller_nolines');
+                }
+                $log_data = implode("\n", $out_lines);
+            }
+        }
+        header("Content-Type: application/json");
+        echo json_encode(L\webencode($log_data));
+    }
+}
--- a/src/controllers/RegisterController.php
+++ b/src/controllers/RegisterController.php
--- a/src/controllers/ResourceController.php
+++ b/src/controllers/ResourceController.php
@ -0,0 +1,384 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\FetchUrl;
+use seekquarry\yioop\library\UrlParser;
+use seekquarry\yioop\library\MediaConstants;
+use seekquarry\yioop\library\processors\ImageProcessor;
+
+/**
+ * Used to serve resources, css, or scripts such as images from APP_DIR
+ *
+ * @author Chris Pollett
+ */
+class ResourceController extends Controller implements CrawlConstants
+{
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    public $activities = ["get", "syncList", "syncNotify", "suggest"];
+    /**
+     * Determines which activity is being requested and calls the method
+     * for that activity. The get and suggest activities are run without
+     * further checks here; any other activity is only run if the request
+     * seems to be coming from a legitimate fetcher or mirror server.
+     * Everything else ends in requestError().
+     */
+    public function processRequest()
+    {
+        // a request might not name an activity at all
+        $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : "";
+        $is_open_activity = in_array($activity, ["get", "suggest"]);
+        if (($is_open_activity || $this->checkRequest()) &&
+            in_array($activity, $this->activities)) {
+            $this->call($activity);
+            return;
+        }
+        $this->requestError();
+    }
+    /**
+     * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
+     * CRAWL_DIR/$_REQUEST['f'] after cleaning and outputs it together with
+     * content type, disposition and range related headers.
+     *
+     * Other request fields used: t (a non-empty value adds .jpg to the file
+     * name; the value feed additionally allows a missing file to be made
+     * from the image whose url is stored in the file of the same name with
+     * .txt appended), sf (sub-folder in front of the file name), and o and l
+     * (offset and length of the part of the file to output). Requests for
+     * the cache folder are only served if checkRequest() succeeds. If the
+     * folder cannot be accessed or the file does not exist, requestError()
+     * is called and nothing is served.
+     */
+    public function get()
+    {
+        if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
+            return;
+        }
+        $name = $this->clean($_REQUEST['n'], "file_name");
+        if (in_array($_REQUEST['f'], ["css", "scripts", "resources"])) {
+            /* notice in this case we didn't check if request come from a
+               legitimate source but we do try to restrict it to being
+               a file (not a folder) in the above array. If the request
+               is for a file in resources, then if it is for a private
+               group, we will check in getBaseFolder if the request is legit
+            */
+            $base_dir = $this->getBaseFolder();
+            if (!$base_dir) {
+                /* requestError() only sends a redirect header, so stop
+                   here, otherwise the path below would be built from an
+                   empty base folder
+                 */
+                $this->requestError();
+                return;
+            }
+            $type = UrlParser::getDocumentType($name);
+            if (!empty($_REQUEST['t']) && $_REQUEST['t'] == 'feed') {
+                $type = "";
+            }
+            $name = UrlParser::getDocumentFilename($name);
+            $name = ($type != "") ? "$name.$type":$name;
+            if (!empty($_REQUEST['t'])) {
+                $name .= ".jpg";
+            }
+            $sub_path = "";
+            if (!empty($_REQUEST['sf'])) {
+                $sub_path = $this->clean($_REQUEST['sf'], "string");
+                // dots are removed so the sub-folder cannot contain ..
+                $sub_path = str_replace(".", "", $sub_path) . "/";
+                if ($sub_path == "/") {
+                    $sub_path = "";
+                }
+            }
+            $name = $sub_path . $name;
+        } else if (in_array($_REQUEST['f'], ["cache"])) {
+            /*  perform check since these request should come from a known
+                machine
+            */
+            if (!$this->checkRequest()) {
+                /* requestError() only sends a redirect header, so stop
+                   here, otherwise the cache file would still be output
+                 */
+                $this->requestError();
+                return;
+            }
+            $folder = $_REQUEST['f'];
+            $base_dir = C\CRAWL_DIR."/$folder";
+        } else {
+            return;
+        }
+        if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
+            $offset = $this->clean($_REQUEST['o'], "int");
+            $limit = $this->clean($_REQUEST['l'], "int");
+        }
+        $path = "$base_dir/$name";
+        if (isset($_REQUEST['t']) && $_REQUEST['t'] == 'feed' &&
+            !file_exists($path) && file_exists("$path.txt")) {
+            /* the image itself is not there yet, only a file holding its
+               url, so download the image and store a thumb of it at $path
+             */
+            $image_url = file_get_contents("$path.txt");
+            if (!empty($image_url)) {
+                $image_page = FetchUrl::getPage($image_url);
+                restore_error_handler();
+                $image = @imagecreatefromstring($image_page);
+                set_error_handler(C\NS_LIB . "yioop_error_handler");
+                $thumb = ImageProcessor::createThumb($image);
+                if (!empty($thumb)) {
+                    file_put_contents($path, $thumb);
+                }
+            }
+        }
+        if (file_exists($path)) {
+            $path = realpath($path);
+            $mime_type = L\mimeType($path);
+            $size = filesize($path);
+            $start = 0;
+            $end = $size - 1;
+            header("Content-type: $mime_type");
+            header('Content-Disposition: filename="' .$name. '"');
+            header("Accept-Ranges: bytes");
+            if (isset($_SERVER['HTTP_RANGE'])) {
+                $this->serveRangeRequest($path, $size, $start, $end);
+                return;
+            }
+            /* NOTE(review): the length and range headers describe the
+               whole file even when only the o/l part of it is output below
+             */
+            header("Content-Length: ".$size);
+            header("Content-Range: bytes $start-$end/$size");
+            if (isset($offset) && isset($limit)) {
+                echo file_get_contents($path, false, null, $offset, $limit);
+            } else {
+                readfile($path);
+            }
+        } else {
+            $this->requestError();
+        }
+    }
+    /**
+     * Handles requests to this controller that result in errors by sending
+     * a Location header for error.php under BASE_URL. The script is not
+     * ended here; the caller carries on after this method returns.
+     */
+    public function requestError()
+    {
+        $error_page = C\BASE_URL . "/error.php";
+        header("Location:" . $error_page);
+    }
+    /**
+     * Computes based on the request the folder that should be used to
+     * find a file during a resource get request. It also checks if user
+     * has access to the requested folder.
+     *
+     * Request fields used: f (folder under APP_DIR), s (sub-folder of
+     * resources, only used when g is absent), g (group id), p (page id), and
+     * t (if set, the thumb variant of a group folder is used). For a group
+     * other than the public one the CSRF token must be valid.
+     *
+     * @return mixed either a string with the folder name in it or false if
+     *      the user does not have access or that folder does not exist.
+     */
+    public function getBaseFolder()
+    {
+        $folder = $this->clean($_REQUEST['f'], 'string');
+        $base_dir = C\APP_DIR . "/$folder";
+        $add_to_path = false;
+        $is_group_item = false;
+        $page_id = "";
+        if (isset($_REQUEST['s'])&& !isset($_REQUEST['g']) &&
+            $folder == "resources") {
+            /* handle sub-folders of resource (name is cleaned as a hash);
+               they live under a folder named by their first 3 characters
+             */
+            $subfolder = $this->clean($_REQUEST['s'], "hash");
+            $prefix_folder = substr($subfolder, 0, 3);
+            $add_to_path = true;
+        } else if (isset($_REQUEST['g'])) {
+            $user_id = isset($_SESSION['USER_ID']) ? $_SESSION['USER_ID'] :
+                C\PUBLIC_USER_ID;
+            if (isset($_REQUEST['p'])) {
+                $page_id = $this->clean($_REQUEST['p'], 'string');
+            }
+            $group_id = $this->clean($_REQUEST['g'], "int");
+            $group_model = $this->model('group');
+            $token_okay = true;
+            $pre_token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user_id);
+            if ($group_id == C\PUBLIC_GROUP_ID) {
+                // public group: no token needed, look up as public user
+                $user_id = C\PUBLIC_USER_ID;
+            } else {
+                $token_okay = $pre_token_okay;
+                // the user agent header may be absent from a request
+                $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ?
+                    $_SERVER['HTTP_USER_AGENT'] : "";
+                if (empty($_COOKIE) && stristr($user_agent,
+                    "Mobile") !== false && stristr($user_agent,
+                    "Safari") !== false) {
+                    header('HTTP/1.0 403 Forbidden');
+                    //fixes mobile safari no send cookie bug
+                    exit();
+                }
+            }
+            $group = $group_model->getGroupById($group_id, $user_id);
+            if (!$group || !$token_okay) {
+                return false;
+            }
+            $prefix_word = (isset($_REQUEST['t'])) ? 't' : '';
+            $base_subfolder = L\crawlHash(
+                'group' . $group_id. $page_id . C\AUTH_KEY);
+            $prefix_folder = substr($base_subfolder, 0, 3);
+            $subfolder = $prefix_word . $base_subfolder;
+            $add_to_path = true;
+            $is_group_item = true;
+        }
+        if ($add_to_path) {
+            if ($is_group_item) {
+                $redirect_dir = "$base_dir/$prefix_folder/$base_subfolder";
+            }
+            if ($is_group_item &&
+                file_exists($redirect_dir . "/redirect.txt")) {
+                /* redirect.txt holds the path of the folder to use in
+                   place of the group's own one
+                 */
+                $tmp_path = file_get_contents($redirect_dir . "/redirect.txt");
+                if (is_dir($tmp_path)) {
+                    if ($subfolder == $base_subfolder) {
+                        $base_dir = $tmp_path;
+                    } else {
+                        /* a t request: use the folder named by the hash
+                           of the redirect target, prefixed with t
+                         */
+                        $subfolder = L\crawlHash($tmp_path);
+                        $prefix_folder = substr($subfolder, 0, 3);
+                        $subfolder = $prefix_word . $subfolder;
+                        $base_dir .= "/$prefix_folder/$subfolder";
+                    }
+                }
+            } else {
+                $base_dir .= "/$prefix_folder/$subfolder";
+            }
+        }
+        return $base_dir;
+    }
+    /**
+     * Code to handle HTTP range requests of resources. This allows
+     * HTTP pseudo-streaming of video. This code was inspired by:
+     * http://www.tuxxin.com/php-mp4-streaming/
+     *
+     * @param string $file Name of file to serve range request for
+     * @param int $size size of the file in bytes
+     * @param int $start starting byte location want to serve
+     * @param int $end ending byte location want to serve
+     */
+    public function serveRangeRequest($file, $size, $start, $end)
+    {
+        /* Only a single byte range is handled. $start and $end are the
+           defaults used for an open-ended range and $end is also the upper
+           bound a requested end offset is clamped to.
+         */
+        $range_header = isset($_SERVER['HTTP_RANGE']) ?
+            $_SERVER['HTTP_RANGE'] : "";
+        $header_parts = explode('=', $range_header, 2);
+        $range = isset($header_parts[1]) ? trim($header_parts[1]) : "";
+        $current_start = $start;
+        $current_end = $end;
+        // missing, multi-part, or dash-less range specs are rejected
+        $range_okay = ($range !== "" && strpos($range, ',') === false &&
+            strpos($range, '-') !== false);
+        if ($range_okay && $range != '-') {
+            list($first, $last) = explode('-', $range, 2);
+            $first = trim($first);
+            $last = trim($last);
+            $last_is_number = (preg_match('/^\d+$/', $last) == 1);
+            if ($first === "") {
+                // suffix form "-N": the final N bytes of the file
+                if ($last_is_number) {
+                    $current_start = max(0, $size - (int)$last);
+                } else {
+                    $range_okay = false;
+                }
+            } else if (preg_match('/^\d+$/', $first) == 1) {
+                $current_start = (int)$first;
+                // a missing or non-numeric end means "through $end"
+                if ($last_is_number) {
+                    $current_end = min((int)$last, $end);
+                }
+            } else {
+                $range_okay = false;
+            }
+        } else if ($range_okay) {
+            // a bare "-" has always been answered with the final byte
+            $current_start = $size - 1;
+        }
+        if (!$range_okay || $current_start > $current_end ||
+            $current_start > $size - 1 || $current_end >= $size) {
+            header('HTTP/1.1 416 Requested Range Not Satisfiable');
+            // unsatisfied-range form: no byte positions, only total size
+            header("Content-Range: bytes */$size");
+            return;
+        }
+        $fp = @fopen($file, 'rb');
+        if ($fp === false) {
+            // nothing has been sent yet, so an error status can still go out
+            header('HTTP/1.1 500 Internal Server Error');
+            return;
+        }
+        $length = $current_end - $current_start + 1;
+        fseek($fp, $current_start);
+        header('HTTP/1.1 206 Partial Content');
+        header("Content-Range: bytes $current_start-$current_end/$size");
+        header("Content-Length: " . $length);
+        $buffer = 8192;
+        /* count down the bytes still owed so exactly $length bytes are sent
+           and fread is never asked for a zero-length read
+         */
+        $remaining = $length;
+        while ($remaining > 0 && !feof($fp) && connection_status() == 0) {
+            $chunk = fread($fp, min($buffer, $remaining));
+            if ($chunk === false || $chunk === "") {
+                break;
+            }
+            echo $chunk;
+            flush();
+            $remaining -= strlen($chunk);
+        }
+        fclose($fp);
+    }
+    /**
+     * Used to get a keyword suggest trie. This sends additional
+     * header so will be decompressed on the fly
+     */
+    public function suggest()
+    {
+        if (!isset($_REQUEST["locale"]) || !is_string($_REQUEST["locale"])) {
+            return;
+        }
+        $locale = $_REQUEST["locale"];
+        /* Accept only tags shaped like "en" or "en-US". The class must be
+           a-zA-Z: the range A-z would also admit [ \ ] ^ _ and backtick.
+           The D modifier stops $ from matching before a trailing newline.
+         */
+        $count = preg_match("/^[a-zA-Z]{2}(-[a-zA-Z]{2})?$/D", $locale);
+        if ($count != 1) {
+            return;
+        }
+        // locale folders use an underscore where the tag has a hyphen
+        $locale = str_replace("-", "_", $locale);
+        $path = C\LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz";
+        if (file_exists($path)) {
+            // file is sent as stored; the client undoes the gzip encoding
+            header("Content-Type: application/json");
+            header("Content-Encoding: gzip");
+            header("Content-Length: ".filesize($path));
+            readfile($path);
+        }
+    }
+    /**
+     * Used to notify a machine that another machine acting as a mirror
+     * is still alive. Data is stored in a txt file self::mirror_table_name
+     */
+    public function syncNotify()
+    {
+        // only requests reporting a positive last_sync are recorded
+        if (!isset($_REQUEST['last_sync']) || !($_REQUEST['last_sync'] > 0)) {
+            return;
+        }
+        $mirror_table_name = C\CRAWL_DIR."/".self::mirror_table_name;
+        $mirror_table = [];
+        $time = time();
+        if (file_exists($mirror_table_name)) {
+            $mirror_table = unserialize(
+                file_get_contents($mirror_table_name));
+            /* Start over if the file could not be decoded, or if the last
+               update is older than MIRROR_SYNC_FREQUENCY (compared against
+               a time() difference, so presumably seconds). The age is
+               now minus stored time; the reverse order is never positive,
+               so stale entries were never dropped.
+             */
+            if (!is_array($mirror_table) ||
+                (isset($mirror_table['time']) &&
+                $time - $mirror_table['time'] > C\MIRROR_SYNC_FREQUENCY)) {
+                $mirror_table = [];
+            }
+        }
+        if (isset($_REQUEST['robot_instance'])) {
+            $machine_uri = isset($_REQUEST['machine_uri']) ?
+                $_REQUEST['machine_uri'] : "";
+            $mirror_table['time'] = $time;
+            // one entry per robot instance: [address, uri, seen, last sync]
+            $mirror_table['machines'][
+                $this->clean($_REQUEST['robot_instance'], "string")] =
+                [$_SERVER['REMOTE_ADDR'], $machine_uri,
+                $time,
+                $this->clean($_REQUEST['last_sync'], "int")];
+            file_put_contents($mirror_table_name, serialize($mirror_table));
+        }
+    }
+    /**
+     * Returns a list of syncable files and the modification times
+     */
+    public function syncList()
+    {
+        // note the notification from the requester before answering it
+        $this->syncNotify();
+        $last_sync = isset($_REQUEST["last_sync"]) ?
+            $this->clean($_REQUEST["last_sync"], "int") : 0;
+        // substrings to exclude from our list
+        $excludes = [".DS", "__MACOSX", "queries", "QueueBundle", "tmp",
+            "thumb"];
+        $crawl_model = $this->model("crawl");
+        $cache_dir = C\CRAWL_DIR."/cache";
+        $sync_files = $crawl_model->getDeltaFileInfo($cache_dir, $last_sync,
+            $excludes);
+        if (count($sync_files) > 0) {
+            $info = [self::STATUS => self::CONTINUE_STATE,
+                self::DATA => $sync_files];
+        } else {
+            $info = [self::STATUS => self::NO_DATA_STATE];
+        }
+        // reply is a serialized array, compressed and then base64 encoded
+        echo base64_encode(gzcompress(serialize($info)));
+    }
+}
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
--- a/src/controllers/SettingsController.php
+++ b/src/controllers/SettingsController.php
@ -0,0 +1,184 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop as B;
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+
+/**
+ * Controller used to handle the settings page of a SeekQuarry
+ * search site. Used to both display and save a user's language,
+ * results per page, open in tabs, and search index choices.
+ *
+ * @author Chris Pollett
+ */
+class SettingsController extends Controller
+{
+    /**
+     * Sets up the available perpage and language options.
+     * If handling data sent from a form, it stores validated versions of
+     * the language, results per page, open in tabs, and search index
+     * choices into the session
+     */
+    public function processRequest()
+    {
+        $data = [];
+        $view = "settings";
+        $changed_settings_flag = false;
+        $crawl_model = $this->model("crawl");
+        if (isset($_SESSION['USER_ID']) && isset($_REQUEST[C\CSRF_TOKEN])) {
+            $user = $_SESSION['USER_ID'];
+            $token_okay = $this->checkCSRFToken(C\CSRF_TOKEN, $user);
+            if ($token_okay) {
+                $data['ADMIN'] = 1;
+            } else {
+                /* a bad token ends the signed-in session, so the request
+                   is handled as an anonymous one and is not marked ADMIN
+                 */
+                $user = $_SERVER['REMOTE_ADDR'];
+                unset($_SESSION['USER_ID']);
+            }
+        } else {
+            $user = $_SERVER['REMOTE_ADDR'];
+            $token_okay = true;
+        }
+        $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
+        $languages = $this->model("locale")->getLocaleList();
+        $data['LANGUAGES'] = [];
+        foreach ($languages as $language) {
+            $data['LANGUAGES'][$language['LOCALE_TAG']] =
+                $language['LOCALE_NAME'];
+        }
+        if ($token_okay && isset($_REQUEST['lang']) &&
+            $this->isListedValue($_REQUEST['lang'],
+            array_keys($data['LANGUAGES']))) {
+            $_SESSION['l'] = $_REQUEST['lang'];
+            L\setLocaleObject($_SESSION['l']);
+            $changed_settings_flag = true;
+        }
+        $data['LOCALE_TAG'] = L\getLocaleTag();
+        $n = C\NUM_RESULTS_PER_PAGE;
+        $data['PER_PAGE'] =
+            [$n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n];
+        if ($token_okay && isset($_REQUEST['perpage']) &&
+            $this->isListedValue($_REQUEST['perpage'],
+            array_keys($data['PER_PAGE']))) {
+            $_SESSION['MAX_PAGES_TO_SHOW'] = $_REQUEST['perpage'];
+            $changed_settings_flag = true;
+        }
+        if (isset($_SESSION['MAX_PAGES_TO_SHOW'])) {
+            $data['PER_PAGE_SELECTED'] = $_SESSION['MAX_PAGES_TO_SHOW'];
+        } else {
+            $data['PER_PAGE_SELECTED'] = C\NUM_RESULTS_PER_PAGE;
+        }
+        /* perpage is used as the sign that the form was submitted; an
+           unchecked open_in_tabs box is simply absent from the request
+         */
+        if ($token_okay && isset($_REQUEST['perpage'])) {
+            $_SESSION['OPEN_IN_TABS'] = isset($_REQUEST['open_in_tabs']);
+        }
+        if (isset($_SESSION['OPEN_IN_TABS'])) {
+            $data['OPEN_IN_TABS'] = $_SESSION['OPEN_IN_TABS'];
+        } else {
+            $data['OPEN_IN_TABS'] = false;
+        }
+        $machine_urls = $this->model("machine")->getQueueServerUrls();
+        $crawls = $crawl_model->getCrawlList(false, true, $machine_urls,
+            true);
+        $data['CRAWLS'] = [];
+        foreach ($crawls as $crawl) {
+            $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'].
+                " ... ".$crawl['COUNT']." urls";
+        }
+        $mixes = $crawl_model->getMixList($user);
+        if (!empty($mixes)) {
+            foreach ($mixes as $mix) {
+                $data['CRAWLS'][$mix['TIMESTAMP']] = $mix['NAME'].
+                    " ... ".tl('settings_controller_crawl_mix');
+            }
+        }
+        $crawl_stamps = array_keys($data['CRAWLS']);
+        if ($token_okay) {
+            /* call first so it always runs, then keep any change already
+               noted for the language or per page settings
+             */
+            $changed_settings_flag = $this->loggedInChangeSettings($data) ||
+                $changed_settings_flag;
+        } else if (isset($_REQUEST['its']) &&
+            $this->isListedValue($_REQUEST['its'], $crawl_stamps)) {
+            $data['its'] = $_REQUEST['its'];
+        } else {
+            $data['its'] = $crawl_model->getCurrentIndexDatabaseName();
+        }
+        if ($changed_settings_flag) {
+            $this->model("user")->setUserSession($user, $_SESSION);
+            return $this->redirectWithMessage(
+                tl('settings_controller_settings_saved'),
+                ['return', 'oldc']);
+        }
+        $this->displayView($view, $data);
+    }
+    /**
+     * Changes settings for a logged in user, this might involve storing
+     * data into the active session. Also works out the url used to
+     * return to the page the user came from.
+     *
+     * @param array& $data fields which might be sent to the view
+     * @return bool if any settings were changed
+     */
+    public function loggedInChangeSettings(&$data)
+    {
+        $crawl_model = $this->model("crawl");
+        $crawl_stamps = array_keys($data['CRAWLS']);
+        $changed_settings_flag = false;
+        if (isset($_REQUEST['index_ts']) &&
+            $this->isListedValue($_REQUEST['index_ts'], $crawl_stamps)) {
+            $_SESSION['its'] = $_REQUEST['index_ts'];
+            $data['its'] = $_REQUEST['index_ts'];
+            $changed_settings_flag = true;
+        } else if (isset($_SESSION['its']) &&
+            $this->isListedValue($_SESSION['its'], $crawl_stamps)) {
+            $data['its'] = $_SESSION['its'];
+        } else {
+            $data['its'] = $crawl_model->getCurrentIndexDatabaseName();
+        }
+        if (isset($_REQUEST['return'])) {
+            $c = "admin";
+            if (isset($_REQUEST['oldc'])) {
+                $c = $this->clean($_REQUEST['oldc'], "string");
+                $data['oldc'] = $c;
+            }
+            $return = $this->clean($_REQUEST['return'], 'string');
+            $data['return'] = $return;
+            // delimiter used if a CSRF token is appended to the return url
+            $delim = "?";
+            if (C\REDIRECTS_ON && $c == 'search' && $return == 'more') {
+                $data['RETURN'] = B\moreUrl();
+            } else if (substr($return, 0, 2) == 's/') {
+                $data['RETURN'] = B\subsearchUrl(substr($return, 2));
+            } else {
+                $data['RETURN'] = B\controllerUrl($c, true) . "a=$return";
+                $delim = '&amp;';
+            }
+            if (!empty($data['ADMIN'])) {
+                $data['RETURN'] .= $delim .
+                    C\CSRF_TOKEN."=".$data[C\CSRF_TOKEN];
+            }
+        }
+        return $changed_settings_flag;
+    }
+    /**
+     * Checks that a request or session value is exactly one of a list of
+     * allowed values. Values are compared as strings, so input such as
+     * "10abc" cannot loosely match the integer 10 on PHP versions where
+     * that comparison succeeds
+     *
+     * @param mixed $value value to check
+     * @param array $allowed scalar values that are acceptable
+     * @return bool whether $value is one of $allowed
+     */
+    private function isListedValue($value, $allowed)
+    {
+        return is_scalar($value) &&
+            in_array((string)$value, array_map('strval', $allowed), true);
+    }
+}
--- a/src/controllers/StaticController.php
+++ b/src/controllers/StaticController.php
@ -0,0 +1,237 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+
+/**
+ * This controller is  used by the Yioop web site to display
+ * PUBLIC_GROUP_ID pages more like static forward facing pages.
+ *
+ * @author Chris Pollett
+ */
+class StaticController extends Controller
+{
+    /**
+     * Says which activities (roughly methods invoke from the web)
+     * this controller will respond to
+     * @var array
+     */
+    public $activities = ["showPage", "signout"];
+    /**
+     * This is the main entry point for handling people arriving to view
+     * a static page. It determines which page to draw and calls the view
+     * to draw it.
+     */
+    public function processRequest()
+    {
+        $data = [];
+        $view = "static";
+        $activity = "showPage";
+        if (isset($_REQUEST['a']) &&
+            in_array($_REQUEST['a'], $this->activities)) {
+            $activity = $_REQUEST['a'];
+            if ($activity == "signout") {
+                // signout only adds a message; the page is drawn as usual
+                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
+                    tl('static_controller_logout_successful')."</h1>')";
+                $activity = "showPage";
+            }
+        }
+        $data['VIEW'] = $view;
+        $data = array_merge($data, $this->call($activity));
+        if (isset($_SESSION['USER_ID'])) {
+            $user = $_SESSION['USER_ID'];
+        } else {
+            $user = $_SERVER['REMOTE_ADDR'];
+        }
+        $data[C\CSRF_TOKEN] = $this->generateCSRFToken($user);
+        if (isset($_SESSION['USER_ID'])) {
+            $data['ADMIN'] = 1;
+        }
+        $this->initializeAdFields($data);
+        $this->displayView($view, $data);
+    }
+    /**
+     * This activity is used to display one of the PUBLIC_GROUP_ID pages
+     * used by the Yioop Web Site
+     *
+     * @return array $data has title and page contents of the static page to
+     *     display
+     */
+    public function showPage()
+    {
+        $data = [];
+        if (isset($_REQUEST['p'])) {
+            $page = $this->clean($_REQUEST['p'], "string");
+            // drop ".." and "/" from the requested page name
+            $page = preg_replace("@(\.\.|\/)@", "", $page);
+        } else {
+            $page = "404";
+        }
+        $page_string = $this->getPage($page);
+        if ($page_string == "") {
+            $page = "404";
+            $page_string = $this->getPage($page);
+        }
+        // a backtick in the page text causes the math script to be included
+        if (strpos($page_string, "`") !== false) {
+            if (!isset($data["INCLUDE_SCRIPTS"])) {
+                $data["INCLUDE_SCRIPTS"] = [];
+            }
+            $data["INCLUDE_SCRIPTS"][] = "math";
+        }
+        $data['page'] = $page;
+        $static_view = $this->view("static");
+        $this->parsePageHeadVarsView($static_view, $page, $page_string);
+        if (isset($_SESSION['value'])) {
+            $data['value'] = $this->clean($_SESSION['value'], "string");
+        }
+        $head_info = isset($static_view->head_objects[$page]) ?
+            $static_view->head_objects[$page] : [];
+        $page_type = isset($head_info['page_type']) ?
+            $head_info['page_type'] : "";
+        if ($page_type == 'page_alias' &&
+            isset($head_info['page_alias']) &&
+            $head_info['page_alias'] != '') {
+            $_REQUEST['p'] = $head_info['page_alias'];
+            return $this->redirectWithMessage("", ['p']);
+        }
+        if (isset($head_info['title'])) {
+            if ($head_info['title']) {
+                $data["subtitle"] = " - ".$head_info['title'];
+            } else {
+                $data["subtitle"] = "";
+            }
+            $static_view->head_objects[$data['page']]['title'] =
+                tl('static_controller_complete_title', $head_info['title']);
+        } else {
+            $data["subtitle"] = "";
+        }
+        $locale_tag = L\getLocaleTag();
+        $data['CONTROLLER'] = "static";
+        $group_model = $this->model("group");
+        // presentation pages are drawn without a page header or footer
+        if (!empty($head_info['page_header']) &&
+            $page_type != 'presentation') {
+            $page_header = $group_model->getPageInfoByName(C\PUBLIC_GROUP_ID,
+                $head_info['page_header'], $locale_tag, "read");
+            $data["PAGE_HEADER"] =
+                $this->pageBodyWithoutHeadVars($page_header);
+        }
+        if (!empty($head_info['page_footer']) &&
+            $page_type != 'presentation') {
+            $page_footer = $group_model->getPageInfoByName(C\PUBLIC_GROUP_ID,
+                $head_info['page_footer'], $locale_tag, "read");
+            $data['PAGE_FOOTER'] =
+                $this->pageBodyWithoutHeadVars($page_footer);
+        }
+        $data['PAGE_ID'] = $group_model->getPageID(C\PUBLIC_GROUP_ID,
+            $page, $locale_tag);
+        if (!empty($_REQUEST['sf'])) {
+            $sub_path = $this->clean($_REQUEST['sf'], 'string');
+            $sub_path = str_replace(".", "", $sub_path);
+            $data['SUB_PATH'] = htmlentities($sub_path);
+        } else {
+            $sub_path = "";
+        }
+        if (!empty($_REQUEST['arg']) && $_REQUEST['arg']=='media' &&
+            !empty($_REQUEST['n'])) {
+            $data['CURRENT_LOCALE_TAG'] = $locale_tag;
+            $this->component("social")->mediaWiki($data, C\PUBLIC_GROUP_ID,
+                $data['PAGE_ID'], $sub_path);
+        } else if ($page_type == 'media_list') {
+            $data['GROUP']['GROUP_ID'] = C\PUBLIC_GROUP_ID;
+            $data['HEAD'] = $head_info;
+            $data['PAGE_NAME'] = $page;
+            $data['CAN_EDIT'] = false;
+            $data['MODE'] = "static";
+            $data['RESOURCE_FILTER'] =
+                (isset($_REQUEST['resource_filter'])) ?
+                 substr($this->clean($_REQUEST['resource_filter'],
+                'file_name'), 0, C\SHORT_TITLE_LEN) : "";
+            $data['page_type'] = 'media_list';
+            $data['RESOURCES_INFO'] =
+                $group_model->getGroupPageResourceUrls(
+                    C\PUBLIC_GROUP_ID, $data['PAGE_ID'], $sub_path);
+            $this->component("social")->sortWikiResources($data);
+        } else if ($page_type == 'presentation') {
+            $data['page_type'] = 'presentation';
+            $data['INCLUDE_SCRIPTS'][] =  "slidy";
+            $data['INCLUDE_STYLES'][] =  "slidy";
+        }
+        return $data;
+    }
+    /**
+     * Used to read in a PUBLIC_GROUP_ID wiki page that will be presented
+     * to non-logged in visitors to the site.
+     *
+     * @param string $page_name name of file less extension to read in
+     * @return string text of page, empty if the page could not be found
+     */
+    public function getPage($page_name)
+    {
+        $group_model = $this->model("group");
+        $locale_tag = L\getLocaleTag();
+        $page_info = $group_model->getPageInfoByName(
+            C\PUBLIC_GROUP_ID, $page_name, $locale_tag, "read");
+        $page_string = isset($page_info["PAGE"]) ? $page_info["PAGE"] : "";
+        if (!$page_string && $locale_tag != C\DEFAULT_LOCALE) {
+            //fallback to default locale for translation
+            $page_info = $group_model->getPageInfoByName(
+                C\PUBLIC_GROUP_ID, $page_name, C\DEFAULT_LOCALE, "read");
+            $page_string = isset($page_info["PAGE"]) ?
+                $page_info["PAGE"] : "";
+        }
+        return $page_string;
+    }
+    /**
+     * Gets the displayable part of a wiki page used as a header or footer:
+     * the text after the END_HEAD_VARS marker if there is one, otherwise
+     * the whole page text
+     *
+     * @param mixed $page_info page record as returned by the group model;
+     *     its PAGE field, if set, holds the page text
+     * @return string text to display, empty if the record has no PAGE field
+     */
+    private function pageBodyWithoutHeadVars($page_info)
+    {
+        if (!isset($page_info['PAGE'])) {
+            return "";
+        }
+        $parts = explode("END_HEAD_VARS", $page_info['PAGE']);
+        return (isset($parts[1])) ? $parts[1] : "" . $page_info['PAGE'];
+    }
+}
--- a/src/controllers/components/AccountaccessComponent.php
+++ b/src/controllers/components/AccountaccessComponent.php
--- a/src/controllers/components/AdvertisementComponent.php
+++ b/src/controllers/components/AdvertisementComponent.php
@ -0,0 +1,484 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Pushkar Umaranikar
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers\components;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\UrlParser;
+
+/**
+ * Component of the Yioop control panel used to handle activities for
+ * managing advertisements, i.e., create advertisement, activate/
+ * deactivate advertisement, edit advertisement. It is used by AdminController
+ *
+ * @author Pushkar Umaranikar
+ */
+class AdvertisementComponent extends Component
+{
+    /**
+     * Used to manage the purchase and storage of advertising credits
+     *
+     * @return array $data field variables necessary for display of view
+     */
+    public function manageCredits()
+    {
+        $parent = $this->parent;
+        $credit_model = $parent->model("credit");
+        $signin_model = $parent->model("signin");
+        $user_model = $parent->model("user");
+        $data = [];
+        $data['SCRIPT'] = "";
+        $data['MESSAGE'] = "";
+        $data["ELEMENT"] = "managecredits";
+        $data["AMOUNTS"] = [0 => tl('advertisement_component_credit_amounts'),
+            "10" => tl('advertisement_component_ten_in_credits'),
+            "20" => tl('advertisement_component_twenty_in_credits'),
+            "50" => tl('advertisement_component_fifty_in_credits'),
+            "100" => tl('advertisement_component_hundred_in_credits'),
+        ];
+        // dollars paid => number of credits granted
+        $data['COST_AMOUNTS'] = [
+            10 => 1000, 20 => 2000, 50 => 5000, 100 => 10000,
+        ];
+        $data['MONTHS'] = [ 0 => tl('advertisement_component_month'),
+            "01" => "01", "02" => "02", "03" => "03",
+            "04" => "04", "05" => "05", "06" => "06", "07" => "07",
+            "08" => "08", "09" => "09", "10" => "10", "11" => "11",
+            "12" => "12"
+        ];
+        $user_id = $_SESSION['USER_ID'];
+        $username = $signin_model->getUserName($user_id);
+        $data["USER"] = $user_model->getUser($username);
+        $data["USER_ID"] = $user_id;
+        $current_year = date('Y');
+        $data['YEARS'] = [ 0 => tl('advertisement_component_year')];
+        for ( $year = $current_year; $year < $current_year + 20; $year++ ) {
+            $data['YEARS'][$year] = $year;
+        }
+        $arg = (isset($_REQUEST['arg'])) ? $parent->clean($_REQUEST['arg'],
+            "string") : "";
+        /* only dollar amounts listed in COST_AMOUNTS are accepted; the
+           scalar check keeps an array-valued field from being used as an
+           array offset
+         */
+        $num_dollars = (isset($_REQUEST['NUM_DOLLARS']) &&
+            is_scalar($_REQUEST['NUM_DOLLARS']) &&
+            isset($data['COST_AMOUNTS'][$_REQUEST['NUM_DOLLARS']])) ?
+            $_REQUEST['NUM_DOLLARS'] : 0;
+        $data['BALANCE'] = $credit_model->getCreditBalance($user_id);
+        // root may be configured to get credits without being charged
+        $free_purchase = ($user_id == C\ROOT_ID &&
+            C\ALLOW_FREE_ROOT_CREDIT_PURCHASE);
+        if (C\CreditConfig::isActive() && !$free_purchase) {
+            $data["INCLUDE_SCRIPTS"][] = 'credit';
+            $ad_script_found = false;
+            /* look from the current version downward for a CreditConfig
+               method whose name is derived from the name server's domain
+               and the version number
+             */
+            for ($i = C\YIOOP_VERSION; $i >= C\MIN_AD_VERSION; $i--) {
+                $get_credit_token_initialize_script =
+                    "FN" . md5(UrlParser::getBaseDomain(C\NAME_SERVER) .
+                    $i . "getCreditTokenInitializeScript");
+                if (method_exists( C\NS_CONFIGS . "CreditConfig",
+                    $get_credit_token_initialize_script)) {
+                    $ad_script_found = true;
+                    break;
+                }
+            }
+            if ($ad_script_found) {
+                $data['SCRIPT'] .=
+                    C\CreditConfig::$get_credit_token_initialize_script();
+            } else {
+                $data['DISPLAY_MESSAGE'] =
+                    tl('advertisement_component_script_failure');
+            }
+        }
+        switch ($arg)
+        {
+            case "purchaseCredits":
+                $message = "";
+                if ($num_dollars <= 0) {
+                    return $parent->redirectWithMessage(
+                        tl('advertisement_component_invalid_credit_quantity'),
+                        []);
+                }
+                /*  string to translate stored in column of 32 chars
+                    so not writing advertisement_component
+                 */
+                $strings_to_translate_for_model = [
+                    tl('advertisement_buy_credits'),
+                    tl('advertisement_init_ledger')];
+                /* the token field is absent when no payment script was
+                   shown (for example a free root purchase), so it is only
+                   read when present
+                 */
+                $token = isset($_REQUEST['CREDIT_TOKEN']) ?
+                    $parent->clean($_REQUEST['CREDIT_TOKEN'], "string") : "";
+                if (!$free_purchase) {
+                    $is_active = C\CreditConfig::isActive();
+                    if ($is_active && empty($token)) {
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_credit_token_empty'),
+                            []);
+                    }
+                    if ($is_active && !C\CreditConfig::charge(
+                        $num_dollars, $token, $message)) {
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_processing_error',
+                                $message), []);
+                    }
+                }
+                $credit_model->updateCredits($user_id,
+                    $data['COST_AMOUNTS'][$num_dollars],
+                    'advertisement_buy_credits');
+                return $parent->redirectWithMessage(
+                    tl('advertisement_component_credits_purchased'),
+                    []);
+        }
+        // default: list this user's transactions, most recent first
+        $search_array = [["timestamp", "", "", "DESC"]];
+        $parent->pagingLogic($data, $credit_model, "TRANSACTIONS",
+            C\DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
+            ["USER_ID" => $user_id]);
+        return $data;
+    }
+    /**
+     * Used to handle the Create, Edit and Activation of Advertisements
+     *
+     * Request fields named in $request_field_types are cleaned into $data.
+     * $_REQUEST['arg'] then selects what is done: "addadvertisement"
+     * (validate, price and, if PURCHASE is set, buy an ad), "changestatus",
+     * "editadvertisement" or "search". Several branches finish by returning
+     * the result of $parent->redirectWithMessage() instead of falling through
+     * to the listing of advertisements set up at the end of the method.
+     *
+     * @return array $data field variables necessary for display of view
+     */
+    public function manageAdvertisements()
+    {
+        $parent = $this->parent;
+        $signin_model = $parent->model("signin");
+        $user_model = $parent->model("user");
+        $role_model = $parent->model('role');
+        $advertisement_model = $parent->model("advertisement");
+        $credit_model = $parent->model("credit");
+        $data = [];
+        $data['DURATIONS'] = [ 0 => tl('advertisement_component_num_days'),
+            1 => tl('advertisement_component_one_day'),
+            7 => tl('advertisement_component_seven_days'),
+            30 => tl('advertisement_component_thirty_days'),
+            90 => tl('advertisement_component_ninety_days'),
+            180 => tl('advertisement_component_one_eighty_days'),
+        ];
+        // request field => type handed to $parent->clean(). DURATION is
+        // given the list of DURATIONS keys as its type (presumably an
+        // allowed-values list -- confirm in clean())
+        $request_field_types = [
+            'context' => 'string',
+            "start_row" => 'int', "num_show"  => 'int', "end_row"  => 'int',
+            "NAME" => 'string', "DESTINATION" => 'web-url',
+            "DESCRIPTION"  => 'string', "KEYWORDS"  => 'string',
+            "BUDGET"  => 'int', "DURATION" => array_keys($data['DURATIONS']),
+            'id' => 'int', 'status' => 'int'];
+        $request_fields = array_keys($request_field_types);
+        $data['MONTHS'] = [ 0 => tl('advertisement_component_month'),
+            "01" => "01", "02" => "02", "03" => "03",
+            "04" => "04", "05" => "05", "06" => "06", "07" => "07",
+            "08" => "08", "09" => "09", "10" => "10", "11" => "11",
+            "12" => "12"
+        ];
+        // offer the current year and the nineteen years after it
+        $current_year = date('Y');
+        $data['YEARS'] = [ 0 => tl('advertisement_component_year')];
+        for ( $year = $current_year; $year < $current_year + 20; $year++ ) {
+            $data['YEARS'][$year] = $year;
+        }
+        $data['SCRIPT'] = "";
+        $data['MESSAGE'] = "";
+        $data["ELEMENT"] = "manageadvertisements";
+        $data['FORM_TYPE'] = "addadvertisement";
+        $data['DURATION'] = 0;
+        foreach ($request_field_types as $field => $type) {
+            if (isset($_REQUEST[$field])) {
+                $data[$field] = $parent->clean($_REQUEST[$field], $type);
+            }
+        }
+        // EDIT_AD cancels both CALCULATE and arg, so neither the validation
+        // block below nor any case of the switch runs for this request
+        if (isset($_REQUEST['EDIT_AD'])) {
+            unset($_REQUEST['CALCULATE']);
+            unset($_REQUEST['arg']);
+        }
+        if (isset($_REQUEST['CALCULATE']) || (isset($_REQUEST['arg']) &&
+            $_REQUEST['arg'] == "addadvertisement")) {
+            // NOTE(review): the three redirects below preserve the *value*
+            // of $_REQUEST['arg'] as a field name, whereas the PURCHASE
+            // branch preserves the key 'arg'; $_REQUEST['arg'] may also be
+            // unset here when only CALCULATE was sent. Confirm intended.
+            if (empty($_REQUEST['NAME']) ||
+                empty($_REQUEST['DESCRIPTION']) ||
+                empty($_REQUEST['DESTINATION'])) {
+                return $parent->redirectWithMessage(
+                    tl('advertisement_component_fields_cannot_be_empty'),
+                    array_merge([$_REQUEST['arg']], $request_fields));
+            }
+            if (!isset($_REQUEST['DURATION']) || $_REQUEST['DURATION'] == 0) {
+                return $parent->redirectWithMessage(
+                    tl('advertisement_component_duration_cannot_be_empty'),
+                    array_merge([$_REQUEST['arg']], $request_fields));
+            }
+            if (empty($_REQUEST['KEYWORDS'])) {
+                return $parent->redirectWithMessage(
+                    tl('advertisement_component_enter_keywords'),
+                    array_merge([$_REQUEST['arg']], $request_fields));
+            }
+            // the campaign starts today and ends DURATION - 1 days later;
+            // both dates are also written back into $_REQUEST
+            $data['START_DATE'] = date(C\AD_DATE_FORMAT);
+            $_REQUEST['START_DATE'] = $data['START_DATE'];
+            $start_date = strtotime($data['START_DATE']);
+            $data['END_DATE'] = date(C\AD_DATE_FORMAT,
+                $start_date + (($data['DURATION'] - 1) * C\ONE_DAY));
+            $_REQUEST['END_DATE'] = $data['END_DATE'];
+            // fills in $data['AD_KEYWORDS'] and $data['AD_MIN_BID'], which
+            // the purchase branch below relies on
+            $this->initializeAdKeywords($data, $start_date, $data['DURATION']);
+        }
+        $user_id = $_SESSION['USER_ID'];
+        $is_admin = $role_model->checkUserRole($user_id, C\ADMIN_ROLE);
+        $data['HAS_ADMIN_ROLE'] = $is_admin;
+        $username = $signin_model->getUserName($user_id);
+        $data["USER"] = $user_model->getUser($username);
+        $data["USER_ID"] = $user_id;
+        $data['PAGING'] = "";
+        $search_array = [];
+        $arg = (isset($_REQUEST['arg'])) ? $parent->clean($_REQUEST['arg'],
+            "string") : "";
+        $data['BALANCE'] = $credit_model->getCreditBalance($user_id);
+        switch ($arg)
+        {
+            case "addadvertisement":
+                if ( isset($_REQUEST['PURCHASE'])) {
+                    $advertisement = [];
+                    $advertisement['USER_ID'] = $user_id;
+                    $fields = ["NAME", "DESCRIPTION",
+                        "DESTINATION", "BUDGET", "KEYWORDS",
+                        "START_DATE", "END_DATE"];
+                    foreach ($fields as $field) {
+                        if (isset($_REQUEST[$field])) {
+                            $advertisement[$field] = $data[$field];
+                        }
+                    }
+                    if (empty($_REQUEST['KEYWORDS'])) {
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_enter_keywords'),
+                            array_merge(['arg'], $request_fields));
+                    }
+                    if ($advertisement["BUDGET"] < $data['AD_MIN_BID']) {
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_bid_too_low'),
+                            array_merge(['arg'], $request_fields));
+                    }
+                    if ($data['BALANCE'] < $advertisement["BUDGET"]) {
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_too_few_credits'),
+                            array_merge(['arg'], $request_fields));
+                    }
+                    // never read; NOTE(review): presumably kept so that
+                    // 'advertisement_buy_ad', passed untranslated to
+                    // updateCredits() below, appears in a tl() call --
+                    // confirm against the locale tooling
+                    $strings_to_translate_for_model =
+                        [tl('advertisement_buy_ad')];
+                    $advertisement_model->addAdvertisement($advertisement,
+                        $data["AD_KEYWORDS"], $data['AD_MIN_BID'], $user_id);
+                    // charge the budget against the user's credits
+                    $credit_model->updateCredits($user_id,
+                        -$data["BUDGET"],
+                        'advertisement_buy_ad');
+                    $preserve = [];
+                    if (!empty($_REQUEST['context'])) {
+                        $_REQUEST['arg'] = 'search';
+                        $preserve[] = 'arg';
+                    }
+                    return $parent->redirectWithMessage(
+                        tl('advertisement_component_ad_created'),
+                        array_merge($preserve,
+                        ["start_row", "num_show", "end_row"]));
+                }
+                break;
+            case "changestatus":
+                if (isset($_REQUEST['id'])) {
+                    $ad = $advertisement_model->getAdvertisementById(
+                        $data['id']);
+                    // only the ad's owner or an admin may change its status
+                    if (empty($ad) || ($user_id != $ad['USER_ID'] &&
+                        !$is_admin) ) {
+                        break;
+                    }
+                    // owners may activate or deactivate their own ads;
+                    // admins acting on someone else's ad may activate or
+                    // suspend it
+                    $user_ad_statuses = [C\ADVERTISEMENT_ACTIVE_STATUS,
+                        C\ADVERTISEMENT_DEACTIVATED_STATUS];
+                    $admin_ad_statuses = [C\ADVERTISEMENT_ACTIVE_STATUS,
+                        C\ADVERTISEMENT_SUSPENDED_STATUS];
+                    if ($user_id == $ad['USER_ID'] && !in_array(
+                        $data['status'], $user_ad_statuses)) {
+                        break;
+                    } else if ($user_id != $ad['USER_ID'] &&
+                        $is_admin && !in_array(
+                        $data['status'], $admin_ad_statuses)) {
+                        break;
+                    }
+                    $result = $advertisement_model->setAdvertisementStatus(
+                        $data['id'], $data['status']);
+                    if ($result) {
+                        $preserve = ["start_row", "end_row", "num_show"];
+                        if (!empty($_REQUEST['context'])) {
+                            $_REQUEST['arg'] = 'search';
+                            $preserve[] = 'arg';
+                        }
+                        // translate the identifier once; the translated
+                        // text must not be passed back through tl()
+                        return $parent->redirectWithMessage(
+                            tl('advertisement_component_status_changed'),
+                            array_merge($preserve, $request_fields));
+                    }
+                }
+                break;
+            case "editadvertisement":
+                $data["FORM_TYPE"] = "editadvertisement";
+                $update = false;
+                if (isset($_REQUEST['save'])) {
+                    $update = true;
+                }
+                if (isset($_REQUEST['id'])) {
+                    $ad = $advertisement_model->getAdvertisementById(
+                        $data['id']);
+                    $ad_fields = ["NAME", "DESTINATION",
+                        "DESCRIPTION","BUDGET","KEYWORDS",
+                        "START_DATE", 'END_DATE'];
+                    if (!empty($ad) && ($user_id == $ad['USER_ID'] ||
+                        $is_admin)) {
+                        // values sent with the request win over stored ones
+                        foreach ($ad_fields as $field) {
+                            $data[$field] = isset($data[$field])  ?
+                                $data[$field] : $ad[$field];
+                        }
+                        if ($is_admin) {
+                            $data['AD_USER_NAME'] = $user_model->getUsername(
+                                $ad['USER_ID']);
+                        }
+                        if ($update) {
+                            // only name, description and destination are
+                            // written back on save
+                            $updated_advertisement = [];
+                            $ad_update_fields = ["NAME",
+                                "DESCRIPTION","DESTINATION"];
+                            foreach ($ad_update_fields as $field) {
+                                if (isset($_REQUEST[$field])) {
+                                    $updated_advertisement[$field] =
+                                        $data[$field];
+                                }
+                            }
+                            $advertisement_model->updateAdvertisement(
+                                $updated_advertisement, $data['id']);
+                            foreach ($request_fields as $field) {
+                                unset($data[$field]);
+                            }
+                            unset($data['START_DATE']);
+                            unset($data['END_DATE']);
+                            return $parent->redirectWithMessage(
+                                tl('advertisement_component_ad_updated'),
+                                ["arg", "id", "start_row", "num_show",
+                                "end_row", "context"]);
+                        }
+                    }
+                }
+                break;
+            case "search":
+                $data["FORM_TYPE"] = "search";
+                $search_array =
+                    $parent->tableSearchRequestHandler($data,
+                        "manageAdvertisements",
+                        ['name', 'description', 'destination', 'keywords',
+                        'budget', 'start_date', 'end_date']);
+                // a request carrying 'name', or the absence of a remembered
+                // search, moves the session's SEARCH entry to LAST_SEARCH;
+                // otherwise the remembered search is restored below
+                if (empty($_SESSION['LAST_SEARCH']['manageAdvertisements']) ||
+                    isset($_REQUEST['name'])) {
+                    $_SESSION['LAST_SEARCH']['manageAdvertisements'] =
+                        $_SESSION['SEARCH']['manageAdvertisements'];
+                    unset($_SESSION['SEARCH']['manageAdvertisements']);
+                } else {
+                    $default_search = true;
+                }
+                break;
+        }
+        if ($search_array == [] || !empty($default_search)) {
+            if (!empty($_SESSION['LAST_SEARCH']['manageAdvertisements'])) {
+                if (!empty($_REQUEST['arg']) && $_REQUEST['arg'] == 'search') {
+                    $search_array =
+                        $parent->restoreLastSearchFromSession($data,
+                        'manageAdvertisements');
+                } else if (!empty($_REQUEST['context'])) {
+                    $search_array =
+                        $_SESSION['LAST_SEARCH']['manageAdvertisements'][
+                        'SEARCH_ARRAY'];
+                    $data['PAGING'] = $_SESSION['LAST_SEARCH'][
+                        'manageAdvertisements']['PAGING'];
+                }
+            }
+            // with nothing else to go on, list ads by descending id
+            if ($search_array == []) {
+                $search_array[] = ["id", "", "", "DESC"];
+            }
+        }
+        if (!C\MOBILE) {
+            $data['SCRIPT'] .= "\npreview(elt('ad-name'))\n" .
+                "preview(elt('ad-description'))\n".
+                "preview(elt('ad-destination'), 'ad-name')\n";
+        }
+        $parent->pagingLogic($data, $advertisement_model, "ADVERTISEMENTS",
+            C\DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
+            ["USER_ID" => $user_id, "ADMIN" => $is_admin]);
+        return $data;
+    }
+    /**
+     * Sets up $data['AD_KEYWORDS'] as a two level associative array
+     * keyword => date => amount, where amount is half (rounded up) of the
+     * bid amount the advertisement model reports for that keyword on that
+     * day. Sets up $data['EXPENSIVE_KEYWORD'] as the keyword whose amounts,
+     * summed over the dates in question, are largest (on a tie the later
+     * keyword wins) and sets up $data['AD_MIN_BID'] as the minimum bid
+     * required for the dates in question.
+     *
+     * Keywords come from $data['KEYWORDS'], upper-cased, split on commas and
+     * trimmed of surrounding whitespace.
+     *
+     * @param array &$data associative array of data used by the view to
+     *      draw itself
+     * @param int $start_date start date in seconds since beginning of Unix
+     *  epoch
+     * @param int $day_count number of days ad campaign will last
+     */
+    public function initializeAdKeywords(&$data, $start_date, $day_count)
+    {
+        $advertisement_model = $this->parent->model('advertisement');
+        // the built-in trim is used directly; passing the non-static
+        // trim_value method as a [class name, method] callback is a static
+        // call, which newer PHP versions refuse
+        $keywords = array_map('trim',
+            explode(",", strtoupper($data['KEYWORDS'])));
+        $min_bid_reqd = 0;
+        $expensive_bid = 0;
+        foreach ($keywords as $keyword) {
+            $date = date(C\AD_DATE_FORMAT, $start_date);
+            $keyword_bid_amount = 0;
+            for ($k = 0; $k < $day_count; $k++) {
+                $bid_amount = $advertisement_model->getBidAmount(
+                    $keyword, $date);
+                $half_bid = ceil($bid_amount/2);
+                if ($bid_amount > C\AD_KEYWORD_INIT_BID ) {
+                    // above the initial bid only half is required
+                    $min_bid_reqd += $half_bid;
+                    $data['AD_KEYWORDS'][$keyword][$date] =
+                        $half_bid;
+                    $keyword_bid_amount += $half_bid;
+                } else {
+                    // NOTE(review): the full bid is required here yet the
+                    // half bid is what gets recorded -- confirm intended
+                    $min_bid_reqd += $bid_amount;
+                    $data['AD_KEYWORDS'][$keyword][$date] =
+                        $half_bid;
+                    $keyword_bid_amount += $half_bid;
+                }
+                $date = date(C\AD_DATE_FORMAT, strtotime($date .' +1 day'));
+            }
+            if ($keyword_bid_amount >= $expensive_bid) {
+                $expensive_bid = $keyword_bid_amount;
+                $data['EXPENSIVE_KEYWORD'] = $keyword;
+            }
+        }
+        $data['AD_MIN_BID'] = $min_bid_reqd;
+    }
+    /**
+     * Trim white spaces callback for array_walk
+     *
+     * Declared static because it uses no instance state and is handed to
+     * array_walk as a [class name, method name] pair, which is a static
+     * call. Calls made through an instance keep working.
+     *
+     * @param string &$value string to remove initial and trailing whitespace
+     *      from; modified in place
+     */
+    public static function trim_value(&$value)
+    {
+        $value = trim($value);
+    }
+}
--- a/src/controllers/components/Component.php
+++ b/src/controllers/components/Component.php
@ -0,0 +1,85 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\controllers\components;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Translate the supplied arguments into the current locale.
+ *
+ * Thin forwarder that lets code in this subnamespace call tl() unqualified:
+ * every argument received is passed on, unchanged and in order, to the
+ * function named C\NS_LIB . "tl"
+ * @see seekquarry\yioop\library\tl()
+ *
+ * @param string $string_identifier  identifier to be translated
+ * @param mixed $additional_args  used for interpolation in translated string
+ * @return string  translated string
+ */
+function tl()
+{
+    $translator = C\NS_LIB . "tl";
+    $passed_args = func_get_args();
+    return call_user_func_array($translator, $passed_args);
+}
+/**
+ * Writes a string to the current output; a short alias used in place of
+ * an output statement
+ *
+ * @param string $text string to send to the current output
+ */
+function e($text)
+{
+    print $text;
+}
+/**
+ * Common parent of every component on the SeekQuarry site. A component
+ * groups a set of activities, together with their auxiliary methods, so
+ * that a controller can make use of them
+ *
+ * @author Chris Pollett
+ */
+class Component
+{
+    /**
+     * The controller this component lives on; null until the constructor
+     * has run
+     *
+     * @var object
+     */
+    public $parent;
+
+    /**
+     * Attaches this component to a controller by remembering the controller
+     * in the parent field
+     *
+     * @param object $parent_controller reference to the controller this
+     *      component lives on
+     */
+    public function __construct($parent_controller)
+    {
+        $this->parent = $parent_controller;
+    }
+}
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
--- a/src/controllers/components/SocialComponent.php
+++ b/src/controllers/components/SocialComponent.php
--- a/src/controllers/components/SystemComponent.php
+++ b/src/controllers/components/SystemComponent.php
--- a/src/css/editor.css
+++ b/src/css/editor.css
@ -0,0 +1,215 @@
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Eswara Rajesh Pinapala
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+/*
+    editor.css
+    Stylesheet for text-area editor associated with editing wiki pages
+ */
+.wiki-editor
+{
+    display: block;
+    margin: 0 auto;
+}
+.wiki-editor div
+{
+    padding: 5px;
+}
+.wiki-buttons
+{
+    margin-bottom:2px;
+}
+.wiki-buttons input
+{
+    border-style: solid;
+    cursor: pointer;
+    display: inline-block;
+    height: 36px;
+    vertical-align: top;
+    width: 36px;
+}
+.wiki-buttons select
+{
+    background: lightgray;
+    border: 1px solid #EEE;
+    font-size:14pt;
+    height:36px;
+    margin-top: 2px;
+}
+
+/*
+    Fixed-position, nearly opaque gray layer sized to the whole viewport and
+    stacked at z-index 100, one below .wiki-popup-content (101). Starts out
+    hidden (display: none); nothing in this rule shows it, so presumably a
+    script toggles it -- confirm.
+ */
+.wiki-popup-prompt
+{
+    background-color:gray;
+    display: none;
+    height:100%;
+    left: 0;
+    right:0;
+    opacity:0.95;
+    position:fixed;
+    top: 0;
+    width:100%;
+    z-index:100;
+}
+
+.wiki-popup-content
+{
+    background-color: white;
+    border: 16px solid #8A4;
+    height: 3in;
+    left: 20%;
+    right: 20%;
+    overflow: auto;
+    padding: 16px;
+    position:fixed;
+    top: 20%;
+    width: 5in;
+    z-index:101;
+
+}
+.wiki-popup-content h2
+{
+    padding:30px;
+}
+.wiki-popup-content div
+{
+    padding:15px;
+}
+.wiki-popup-content select
+{
+    font-size: 18pt;
+}
+.wiki-popup-content table
+{
+    margin: auto;
+}
+
+.wikibtn-bold
+{
+    background: url(../resources/wiki_button_images/wikibtn-bold.png) no-repeat;
+}
+.wikibtn-underline
+{
+    background: url(../resources/wiki_button_images/wikibtn-underline.png)
+        no-repeat;
+}
+.wikibtn-bullets
+{
+    background: url(../resources/wiki_button_images/wikibtn-bullets.png)
+        no-repeat;
+}
+.wikibtn-heading
+{
+    background: url(../resources/wiki_button_images/wikibtn-heading.png)
+        no-repeat;
+}
+.wikibtn-hr
+{
+    background: url(../resources/wiki_button_images/wikibtn-hr.png) no-repeat;
+}
+.wikibtn-hyperlink
+{
+    background: url(../resources/wiki_button_images/wikibtn-hyperlink.png)
+        no-repeat;
+}
+.wikibtn-italic
+{
+    background: url(../resources/wiki_button_images/wikibtn-italic.png)
+        no-repeat;
+}
+.wikibtn-nowiki
+{
+    background: url(../resources/wiki_button_images/wikibtn-nowiki.png)
+        no-repeat;
+}
+.wikibtn-numbers
+{
+    background:url(../resources/wiki_button_images/wikibtn-numbers.png)
+        no-repeat;
+}
+.wikibtn-strike
+{
+    background: url(../resources/wiki_button_images/wikibtn-strike.png)
+        no-repeat;
+}
+.wikibtn-table
+{
+    background: url(../resources/wiki_button_images/wikibtn-table.png)
+        no-repeat;
+}
+.wikibtn-search-widget
+{
+    background: url(../resources/wiki_button_images/wikibtn-search-widget.png)
+        no-repeat;
+}
+
+.wikibtn-slide
+{
+    background: url(../resources/wiki_button_images/wikibtn-slide.png)
+        no-repeat;
+}
+
+.wikibtn-definitionlist
+{
+    background: url(../resources/wiki_button_images/wikibtn-definitionlist.png)
+        no-repeat;
+}
+.wikibtn-centeraligned
+{
+    background: url(../resources/wiki_button_images/wikibtn-centeraligned.png)
+        no-repeat;
+}
+.wikibtn-rightaligned
+{
+    background: url(../resources/wiki_button_images/wikibtn-rightaligned.png)
+        no-repeat;
+}
+.wikibtn-leftaligned
+{
+    background: url(../resources/wiki_button_images/wikibtn-leftaligned.png)
+        no-repeat;
+}
+/*
+    Hover state shared by all of the wikibtn-* buttons: the background image
+    is shifted up by 36px, which is the button height set in
+    .wiki-buttons input. Presumably each button image stacks its normal and
+    hover artwork vertically -- confirm against the png files.
+ */
+.wikibtn-bold:hover,
+.wikibtn-bullets:hover,
+.wikibtn-heading:hover,
+.wikibtn-hr:hover,
+.wikibtn-hyperlink:hover,
+.wikibtn-italic:hover,
+.wikibtn-nowiki:hover,
+.wikibtn-numbers:hover,
+.wikibtn-search-widget:hover,
+.wikibtn-strike:hover,
+.wikibtn-table:hover,
+.wikibtn-underline:hover,
+.wikibtn-slide:hover,
+.wikibtn-definitionlist:hover,
+.wikibtn-rightaligned:hover,
+.wikibtn-leftaligned:hover,
+.wikibtn-centeraligned:hover
+{
+    background-position: 0 -36px;
+}
--- a/src/css/search.css
+++ b/src/css/search.css
--- a/src/css/slidy.css
+++ b/src/css/slidy.css
@ -0,0 +1,402 @@
+/* slidy.css
+
+   Copyright (c) 2005-2010 W3C (MIT, ERCIM, Keio), All Rights Reserved.
+   W3C liability, trademark, document use and software licensing
+   rules apply, see:
+
+   http://www.w3.org/Consortium/Legal/copyright-documents
+   http://www.w3.org/Consortium/Legal/copyright-software
+*/
+body
+{
+  margin: 0 0 0 0;
+  padding: 0 0 0 0;
+  width: 100%;
+  height: 100%;
+  color: black;
+  background-color: white;
+  font-family: "Gill Sans MT", "Gill Sans", GillSans, sans-serif;
+  font-size: 14pt;
+}
+
+div.toolbar {
+  position: fixed; z-index: 200;
+  top: auto; bottom: 0; left: 0; right: 0;
+  height: 1.2em; text-align: right;
+  padding-left: 1em;
+  padding-right: 1em;
+  font-size: 60%;
+  color: red;
+  background-color: rgb(240,240,240);
+  border-top: solid 1px rgb(180,180,180);
+}
+
+div.toolbar span.copyright {
+  color: black;
+  margin-left: 0.5em;
+}
+
+/*
+  Clickable prompt strip placed 1.2em above the bottom edge, which matches
+  the height given to div.toolbar.
+*/
+div.initial_prompt {
+  position: absolute;
+  z-index: 1000;
+  bottom: 1.2em;
+  width: 100%;
+  /* solid color first; the rgba() declaration below replaces it wherever
+     rgba() is understood, since the later declaration wins */
+  background-color: rgb(200,200,200);
+  opacity: 0.35;
+  background-color: rgba(200,200,200, 0.35);
+  cursor: pointer;
+}
+
+div.initial_prompt p.help {
+  text-align: center;
+}
+
+div.initial_prompt p.close {
+  text-align: right;
+  font-style: italic;
+}
+
+div.slidy_toc {
+  position: absolute;
+  z-index: 300;
+  width: 60%;
+  max-width: 30em;
+  height: 30em;
+  overflow: auto;
+  top: auto;
+  right: auto;
+  left: 4em;
+  bottom: 4em;
+  padding: 1em;
+  background: rgb(240,240,240);
+  border-style: solid;
+  border-width: 2px;
+  font-size: 60%;
+}
+
+div.slidy_toc .toc_heading {
+  text-align: center;
+  width: 100%;
+  margin: 0;
+  margin-bottom: 1em;
+  border-bottom-style: solid;
+  border-bottom-color: rgb(180,180,180);
+  border-bottom-width: 1px;
+}
+
+div.slide {
+  z-index: 20;
+  margin: 0 0 0 0;
+  padding-top: 0;
+  padding-bottom: 0;
+  padding-left: 20px;
+  padding-right: 20px;
+  border-width: 0;
+  clear: both;
+  top: 0;
+  bottom: 0;
+  left: 0;
+  right: 0;
+  line-height: 120%;
+  background-color: transparent;
+}
+
+div.background {
+  display: none;
+}
+
+div.handout {
+  margin-left: 20px;
+  margin-right: 20px;
+}
+
+div.slide.titlepage {
+  text-align: center;
+}
+
+div.slide.titlepage h1 {
+  padding-top: 10%;
+  margin-right: 0;
+}
+
+div.slide h1 {
+  padding-left: 0;
+  padding-right: 20pt;
+  padding-top: 4pt;
+  padding-bottom: 4pt;
+  margin-top: 0;
+  margin-left: 0;
+  margin-right: 60pt;
+  margin-bottom: 0.5em;
+  display: block;
+  font-size: 160%;
+  line-height: 1.2em;
+  background: transparent;
+}
+
+@media screen and (max-device-width: 1024px)
+{
+  div.slide { font-size: 100%; }
+}
+
+@media screen and (max-device-width: 800px)
+{
+  div.slide { font-size: 200%; }
+  div.slidy_toc {
+    top: 1em;
+    left: 1em;
+    right: auto;
+    width: 80%;
+    font-size: 180%;
+  }
+}
+
+div.toc-heading {
+  width: 100%;
+  border-bottom: solid 1px rgb(180,180,180);
+  margin-bottom: 1em;
+  text-align: center;
+}
+
+img {
+  image-rendering: optimize-quality;
+}
+
+pre {
+ font-size: 80%;
+ font-weight: bold;
+ line-height: 120%;
+ padding-top: 0.2em;
+ padding-bottom: 0.2em;
+ padding-left: 1em;
+ padding-right: 1em;
+ border-style: solid;
+ border-left-width: 1em;
+ border-top-width: thin;
+ border-right-width: thin;
+ border-bottom-width: thin;
+ border-color: #95ABD0;
+ color: #00428C;
+ background-color: #E4E5E7;
+}
+
+li pre { margin-left: 0; }
+
+blockquote { font-style: italic }
+
+img { background-color: transparent }
+
+p.copyright { font-size: smaller }
+
+.center { text-align: center }
+.footnote { font-size: smaller; margin-left: 2em; }
+
+a img { border-width: 0; border-style: none }
+
+a:visited { color: navy }
+a:link { color: navy }
+a:hover { color: red; text-decoration: underline }
+a:active { color: red; text-decoration: underline }
+
+a {text-decoration: none}
+.toolbar a:link {color: blue}
+.toolbar a:visited {color: blue}
+.toolbar a:active {color: red}
+.toolbar a:hover {color: red}
+
+ul { list-style-type: square; }
+ul ul { list-style-type: disc; }
+ul ul ul { list-style-type: circle; }
+ul ul ul ul { list-style-type: disc; }
+li { margin-left: 0.5em; margin-top: 0.5em; }
+li li { font-size: 85%; font-style: italic }
+li li li { font-size: 85%; font-style: normal }
+
+div dt
+{
+  margin-left: 0;
+  margin-top: 1em;
+  margin-bottom: 0.5em;
+  font-weight: bold;
+}
+div dd
+{
+  margin-left: 2em;
+  margin-bottom: 0.5em;
+}
+
+
+p,pre,ul,ol,blockquote,h2,h3,h4,h5,h6,dl,table {
+  margin-left: 1em;
+  margin-right: 1em;
+}
+
+p.subhead { font-weight: bold; margin-top: 2em; }
+
+.smaller { font-size: smaller }
+.bigger { font-size: 130% }
+
+td,th { padding: 0.2em }
+
+ul {
+  margin: 0.5em 1.5em 0.5em 1.5em;
+  padding: 0;
+}
+
+ol {
+  margin: 0.5em 1.5em 0.5em 1.5em;
+  padding: 0;
+}
+
+ul { list-style-type: square; }
+ul ul { list-style-type: disc; }
+ul ul ul { list-style-type: circle; }
+ul ul ul ul { list-style-type: disc; }
+
+ul li {
+  list-style: square;
+  margin: 0.1em 0em 0.6em 0;
+  padding: 0 0 0 0;
+  line-height: 140%;
+}
+
+ol li {
+  margin: 0.1em 0em 0.6em 1.5em;
+  padding: 0 0 0 0px;
+  line-height: 140%;
+  list-style-type: decimal;
+}
+
+li ul li {
+  font-size: 85%;
+  font-style: italic;
+  list-style-type: disc;
+  background: transparent;
+  padding: 0 0 0 0;
+}
+li li ul li {
+  font-size: 85%;
+  font-style: normal;
+  list-style-type: circle;
+  background: transparent;
+  padding: 0 0 0 0;
+}
+li li li ul li {
+  list-style-type: disc;
+  background: transparent;
+  padding: 0 0 0 0;
+}
+
+li ol li {
+  list-style-type: decimal;
+}
+
+
+li li ol li {
+  list-style-type: decimal;
+}
+
+/*
+ setting class="outline" on ol or ul makes it behave as an
+ outline list where block-level content in li elements is
+ hidden by default and can be expanded or collapsed with
+ mouse click. Set class="expand" on li to override default
+*/
+
+ol.outline li:hover { cursor: pointer }
+ol.outline li.nofold:hover { cursor: default }
+
+ul.outline li:hover { cursor: pointer }
+ul.outline li.nofold:hover { cursor: default }
+
+ol.outline { list-style:decimal; }
+ol.outline ol { list-style-type:lower-alpha }
+
+ol.outline li.nofold {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/nofold-dim.gif) no-repeat 0px 0.5em;
+}
+ol.outline li.unfolded {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/fold-dim.gif) no-repeat 0px 0.5em;
+}
+ol.outline li.folded {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/unfold-dim.gif) no-repeat 0px 0.5em;
+}
+ol.outline li.unfolded:hover {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/fold.gif) no-repeat 0px 0.5em;
+}
+ol.outline li.folded:hover {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/unfold.gif) no-repeat 0px 0.5em;
+}
+
+ul.outline li.nofold {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/nofold-dim.gif) no-repeat 0px 0.5em;
+}
+ul.outline li.unfolded {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/fold-dim.gif) no-repeat 0px 0.5em;
+}
+ul.outline li.folded {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/unfold-dim.gif) no-repeat 0px 0.5em;
+}
+ul.outline li.unfolded:hover {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/fold.gif) no-repeat 0px 0.5em;
+}
+ul.outline li.folded:hover {
+  padding: 0 0 0 20px;
+  background: transparent url(../graphics/unfold.gif) no-repeat 0px 0.5em;
+}
+
+/* for slides with class "title" in table of contents */
+a.titleslide { font-weight: bold; font-style: italic }
+
+/*
+ hide images for work around for save as bug
+ where browsers fail to save images used by CSS
+*/
+img.hidden { display: none; visibility: hidden }
+div.initial_prompt { display: none; visibility: hidden }
+
+  div.slide {
+     visibility: visible;
+     position: inherit;
+  }
+  div.handout {
+     border-top-style: solid;
+     border-top-width: thin;
+     border-top-color: black;
+  }
+
+@media screen {
+  .hidden { display: none; visibility: visible }
+
+  div.slide.hidden { display: block; visibility: visible }
+  div.handout.hidden { display: block; visibility: visible }
+  div.background { display: none; visibility: hidden }
+  body.single_slide div.initial_prompt { display: block; visibility: visible }
+  body.single_slide div.background { display: block; visibility: visible }
+  body.single_slide div.background.hidden { display: none; visibility: hidden }
+  body.single_slide .invisible { visibility: hidden }
+  body.single_slide .hidden { display: none; visibility: hidden }
+  body.single_slide div.slide { position: absolute }
+  body.single_slide div.handout { display: none; visibility: hidden }
+}
+
+@media print {
+  .hidden { display: block; visibility: visible }
+
+  div.slide pre { font-size: 60%; padding-left: 0.5em; }
+  div.toolbar { display: none; visibility: hidden; }
+  div.slidy_toc { display: none; visibility: hidden; }
+  div.background { display: none; visibility: hidden; }
+  div.slide { page-break-before: always }
+  /* :first-child isn't reliable for print media */
+  div.slide.first-slide { page-break-before: avoid }
+}
--- a/src/data/default.db
+++ b/src/data/default.db
--- a/src/error.php
+++ b/src/error.php
@ -0,0 +1,58 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Web page used to HTTP display error pages for
+ * the SeekQuarry/Yioop Search engine
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\controllers\StaticController;
+
+/*
+ * Only the 404 and 409 error pages are supported; any other request falls
+ * back to 404. The in_array() check is strict so that values which are only
+ * loosely equal to an allowed code (for example "0404" or "4.04e2") are
+ * replaced rather than being passed on as the page name.
+ */
+if (!isset($_REQUEST['p']) ||
+    !in_array($_REQUEST['p'], ["404", "409"], true)) {
+    $_REQUEST['p'] = "404";
+}
+// send the HTTP status line that matches the chosen error page
+switch ($_REQUEST['p']) {
+    case "404":
+        header("HTTP/1.0 404 Not Found");
+        break;
+    case "409":
+        header("HTTP/1.0 409 Conflict");
+        break;
+}
+// the request is handed to the "static" controller
+$_REQUEST['c'] = "static";
+/*
+ * NOTE(review): presumably this constant stops index.php from running
+ * bootstrap() by itself, since bootstrap() is called explicitly below --
+ * confirm against index.php
+ */
+define('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP', true);
+/**
+ * load in main entry point
+ */
+require_once(__DIR__."/index.php");
+bootstrap();
+exit();
--- a/src/examples/0-Archive1421025145.zip
+++ b/src/examples/0-Archive1421025145.zip
--- a/src/examples/IndexData1421025145.zip
+++ b/src/examples/IndexData1421025145.zip
--- a/src/examples/QueryCacher.php
+++ b/src/examples/QueryCacher.php
@ -0,0 +1,82 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\examples;
+
+/**
+ * Script to run a sequence of queries against a Yioop instance
+ * so that their results can be cached
+ */
+define("YIOOP_URL", "http://localhost/");
+define("TIME_BETWEEN_REQUEST_IN_SECONDS", 5);
+define("QUERY_AGENT_NAME", "QUERY_CACHER");
+if (empty($argv[1])) {
+    echo <<< EOD
+QUERY_CACHER
+============
+This program runs a sequence of queries against a Yioop Installation.
+If file caching is turned on for that Yioop Installation, then those query
+will be saved to its cache. To run this program, type a command like:
+php QueryCacher.php file_name.txt
+Here file_name.txt is the name of a text file with one query/line.
+EOD;
+exit();
+} else {
+    echo <<< EOD
+QUERY_CACHER
+============
+Now running a sequence of queries against the yioop installation at:
+
+EOD;
+    echo YIOOP_URL ."\n\n";
+}
+/*
+ * FILE_IGNORE_NEW_LINES keeps the line terminator out of each query, so it
+ * does not end up url-encoded into the request
+ */
+$queries = file($argv[1], FILE_IGNORE_NEW_LINES);
+if ($queries === false) {
+    echo "Could not read the query file: " . $argv[1] . "\n";
+    exit(1);
+}
+$agent = curl_init();
+curl_setopt($agent, CURLOPT_USERAGENT, QUERY_AGENT_NAME);
+curl_setopt($agent, CURLOPT_AUTOREFERER, true);
+curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true);
+curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 0);
+curl_setopt($agent, CURLOPT_SSL_VERIFYPEER, false);
+curl_setopt($agent, CURLOPT_NOSIGNAL, true);
+curl_setopt($agent, CURLOPT_RETURNTRANSFER, true);
+curl_setopt($agent, CURLOPT_FAILONERROR, true);
+curl_setopt($agent, CURLOPT_TIMEOUT, TIME_BETWEEN_REQUEST_IN_SECONDS);
+curl_setopt($agent, CURLOPT_CONNECTTIMEOUT,
+    TIME_BETWEEN_REQUEST_IN_SECONDS);
+curl_setopt($agent, CURLOPT_HTTPHEADER, ['Expect:']);
+$i = 1;
+foreach ($queries as $query) {
+    $query = trim($query);
+    if ($query === "") {
+        // a blank line is not a query, so no request is made for it
+        continue;
+    }
+    echo $i . " " . $query . "\n";
+    curl_setopt($agent, CURLOPT_URL, YIOOP_URL . "?q=". urlencode($query));
+    $response = curl_exec($agent);
+    if ($response === false) {
+        // report the failure but carry on with the remaining queries
+        echo "  request failed: " . curl_error($agent) . "\n";
+    }
+    $i++;
+    sleep(TIME_BETWEEN_REQUEST_IN_SECONDS);
+}
+curl_close($agent);
--- a/src/examples/SearchApi.php
+++ b/src/examples/SearchApi.php
@ -0,0 +1,201 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\examples;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\controllers\SearchController;
+/**
+ * This file contains an example script to show the different
+ * methods of the Yioop! search api
+ */
+// this example should be only run from the command-line
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/** Calculate base directory of script @ignore
+ * If you have Yioop! in a separate folder from your web-site
+ * You should change BASE_DIR to the location of the Yioop! directory
+ */
+define("seekquarry\\yioop\\configs\\PARENT_DIR",
+    substr(dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/src/examples")));
+define("seekquarry\\yioop\\configs\\BASE_DIR", C\PARENT_DIR . "/src");
+/** Load in global configuration settings; you need this*/
+require_once C\BASE_DIR.'/configs/Config.php';
+if (!C\PROFILE) {
+    // trailing space keeps "visiting" and "its" from running together
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+ * We now move the search API test index over to the WORK_DIRECTORY
+ * if it isn't already there. In a real-world set-up a user would have
+ * put a crawl into the WORK_DIRECTORY and that would be used to make the
+ * query.
+ */
+$archive_timestamp = "1421025145";
+$archive = C\BASE_DIR."/examples/0-Archive$archive_timestamp.zip";
+$index_archive = C\BASE_DIR."/examples/IndexData$archive_timestamp.zip";
+$extract_folder = C\CRAWL_DIR."/cache";
+if (!file_exists($archive) ||
+   !file_exists($index_archive)) {
+   echo "\nSearch API test index doesn't exist, so can't run demo\n\n";
+   exit();
+}
+if (class_exists("\ZipArchive")) {
+    $zip = new \ZipArchive();
+    foreach ([$archive, $index_archive] as $zip_file) {
+        // open() returns true on success and an error code otherwise
+        if ($zip->open($zip_file) !== true) {
+            echo "\nCouldn't open $zip_file, so can't run demo\n\n";
+            exit();
+        }
+        $zip->extractTo($extract_folder);
+        $zip->close();
+    }
+} else {
+    // paths are escaped so folders with spaces don't break the command
+    exec("unzip " . escapeshellarg($archive) . " -d " .
+        escapeshellarg($extract_folder));
+    exec("unzip " . escapeshellarg($index_archive) . " -d " .
+        escapeshellarg($extract_folder));
+}
+// Create a SearchController to do queries with
+$controller = new SearchController();
+/*
+  Now we can do queries! First do a simple search on art and print the results
+ */
+echo "\n\n\nAn example of a query request with the search API:\n";
+echo "Total rows numbers are high because by default grouping is done.\n";
+
+$query = "art i:$archive_timestamp";
+    /* i:$archive_timestamp gives the timestamp of the index to use.
+       API requires that a default index be set even though the query might
+       specify to use a different one. The query string we pass to the
+       API can be anything you can type into Yioop! search box.
+     */
+$num_results = 10; // how many results to get back
+$first_result_to_return = 0;
+    // which ranked result should be the first returned (0 = highest ranked)
+$data = $controller->queryRequest($query, $num_results,
+    $first_result_to_return);
+outputQueryData($data);
+
+/*
+   next we do a related search (as our index only has one page in it)
+   the only related page is the page itself
+ */
+echo "\n\n\nAn example of making a related query request with the search API\n";
+$url = "http://www.ucanbuyart.com/";
+$num_results = 10; // how many results to get back
+$first_result_to_return = 0;
+$index_timestamp = $archive_timestamp;
+$data = $controller->relatedRequest($url, $num_results,
+    $first_result_to_return, $index_timestamp);
+outputQueryData($data);
+/*
+   Finally, we give an example of requesting the cached version of
+   a downloaded page...
+ */
+echo "\n\n\nAn example of making a cached of page request".
+    " with the search API:\n";
+$url = "http://www.ucanbuyart.com/";
+$ui_flags = [];
+$search_terms = "art classifieds"; // these words will be highlighted
+$index_timestamp = $archive_timestamp;
+$data = $controller->cacheRequest($url, $ui_flags,
+    $search_terms, $index_timestamp);
+echo $data;
+/*
+  We now delete the example index to clean-up our test. In real-life
+  you wouldn't want to delete your query index after making one query
+*/
+unlinkRecursive(C\CRAWL_DIR."/cache/0-Archive$archive_timestamp");
+unlinkRecursive(C\CRAWL_DIR."/cache/IndexData$archive_timestamp");
+// demo over, bye-bye for now!
+exit();
+/**
+ * Prints the results of a Yioop! query in a human readable form: one
+ * stanza per returned page followed by a short statistics section
+ *
+ * @param array $data what we got back from doing a query; its PAGES,
+ *      LIMIT, TOTAL_ROWS and RESULTS_PER_PAGE fields are read
+ */
+function outputQueryData($data)
+{
+    $rule = "============\n";
+    foreach ($data['PAGES'] as $result) {
+        echo $rule .
+            "TITLE: " . trim($result[CrawlConstants::TITLE]) . "\n" .
+            "URL: " . trim($result[CrawlConstants::URL]) . "\n" .
+            "DESCRIPTION:" .
+            wordwrap(trim($result[CrawlConstants::DESCRIPTION])) . "\n" .
+            "Rank: " . $result[CrawlConstants::DOC_RANK] . "\n" .
+            "Relevance: " . $result[CrawlConstants::RELEVANCE] . "\n" .
+            "Proximity: " . $result[CrawlConstants::PROXIMITY] . "\n" .
+            "Score: " . $result[CrawlConstants::SCORE] . "\n" .
+            $rule . "\n";
+    }
+    $low = $data['LIMIT'];
+    $total = $data['TOTAL_ROWS'];
+    // the last row shown can't be beyond the total number of rows
+    $high = min($total, $low + $data['RESULTS_PER_PAGE']);
+    echo "QUERY STATISTICS\n" . $rule;
+    echo "LOW: " . $low . "\n";
+    echo "HIGH: " . $high . "\n";
+    echo "TOTAL ROWS: " . $total . "\n";
+}
+/**
+ * Recursively delete a directory by traversing it with the
+ * deleteFileOrDir function of the library namespace as the callback
+ *
+ * @param string $dir Directory name
+ * @param boolean $deleteRootToo Delete specified top directory as well
+ */
+function unlinkRecursive($dir, $deleteRootToo = true)
+{
+    $delete_callback = C\NS_LIB . "deleteFileOrDir";
+    traverseDirectory($dir, $delete_callback, $deleteRootToo);
+}
+/**
+ * Recursively traverse a directory structure and call a callback function
+ * exactly once on each file and folder found. The contents of a folder are
+ * always visited before the folder itself, so the callback may delete what
+ * it is given. Nothing is done if $dir can't be opened.
+ *
+ * @param string $dir Directory name
+ * @param function $callback Function to call as traverse structure; it is
+ *      passed the path of the current file or folder
+ * @param boolean $rootToo do op on top-level directory as well
+ */
+function traverseDirectory($dir, $callback, $rootToo = true)
+{
+    $dh = @opendir($dir);
+    if (!$dh) {
+        return;
+    }
+    while (false !== ($obj = readdir($dh))) {
+        if ($obj == '.' || $obj == '..') {
+            continue;
+        }
+        $path = $dir . '/' . $obj;
+        if (is_dir($path)) {
+            /* only the contents are handled by the recursive call; the
+               callback for $path itself is made just below, so asking
+               for it here as well would invoke it twice
+             */
+            traverseDirectory($path, $callback, false);
+        }
+        @$callback($path);
+    }
+    closedir($dh);
+    if ($rootToo) {
+        @$callback($dir);
+    }
+}
--- a/src/examples/WeatherBot.php
+++ b/src/examples/WeatherBot.php
@ -0,0 +1,149 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Harika Nukala harika.nukala@sjsu.edu
+ * @package seek_quarry
+ * @subpackage examples
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\examples\weatherbot;
+
+/**
+ * This class demonstrates a simple Weather Chat Bot using the Yioop
+ * ChatBot APIs for Yioop Discussion Groups.
+ * To use this bot:
+ * (1) Move this file to some folder of a web server you have access to.
+ *     Denote by some_url the url of this folder. If you point your
+ *     browser at this folder you should see a message that begins with:
+ *     There was a configuration issue with your query.
+ * (2) Create a new Yioop User.
+ * (3) Under Manage Accounts, click on the lock symbol next to Account Details
+ * (4) Check the Bot User check box, click save.
+ * (5) Two form variables should appear: Bot Unique Token and Bot Callback URL.
+ *      Fill in a value for Bot Unique Token that matches the value set
+ *      for ACCESS_TOKEN in the code within the WeatherBot class.
+ *      Fill in some_url (as defined in step (1)) for the value of Bot Callback
+ *      URL
+ * (6) Add the user you created in Yioop to the group that you would like
+ *     the bot to service. Let the name of this user be user_name.
+ * (7) Talk to your bot in yioop in this group by commenting on an
+ *     already existing thread with a message beginning with @user_name.
+ */
+class WeatherBot
+{
+    /**
+     * Url of site that this bot gets weather information from
+     */
+    const WEATHER_URL = "http://query.yahooapis.com/v1/public/yql";
+    /**
+     * Token given when setting up the bot in Yioop for callback requests.
+     * This bot checks that a request from a Yioop Instance sends
+     * a timestamp as well as the hash of this timestamp with the bot_token
+     * and post data and that these match the expected values
+     */
+    const ACCESS_TOKEN = "bot_token";
+    /**
+     * Number of seconds that the passed timestamp can differ from the current
+     * time on the WeatherBot machine.
+     */
+    const TIME_WINDOW = 60;
+    /**
+     * This is the method called to get the WeatherBot to handle an incoming
+     * HTTP request, and echo a weather related message. A configuration
+     * error message is echoed instead if the request's token doesn't check
+     * out or if its post or bot_name fields are missing.
+     */
+    public function processRequest()
+    {
+        $result = "There was a configuration issue with your query.";
+        if ($this->checkBotToken() && !empty($_REQUEST['post']) &&
+            !empty($_REQUEST['bot_name'])) {
+            // NOTE: FILTER_SANITIZE_STRING is deprecated as of PHP 8.1
+            $location = filter_var($_REQUEST['post'], \FILTER_SANITIZE_STRING);
+            $location = trim(mb_strtolower($location));
+            $result = $this->getWeather($location);
+            if (empty($result)) {
+                $result = "I failed to find the weather for that location.\n".
+                    "I respond to queries in the format:\n" .
+                    " @{$_REQUEST['bot_name']} some_location";
+            }
+        }
+        echo $result;
+    }
+    /**
+     * This method is used to check a request that it comes from a site
+     * that knows the bot_token in use by this WeatherBot. The request's
+     * bot_token field is expected to have the format hash*timestamp where
+     * hash is the sha256 of ACCESS_TOKEN, the timestamp, and the post data.
+     *
+     * @return bool true if the token is well-formed, its timestamp is
+     *      within TIME_WINDOW seconds of the current time, and its hash
+     *      matches; false otherwise
+     */
+    public function checkBotToken()
+    {
+        if (empty($_REQUEST['bot_token']) ||
+            !is_string($_REQUEST['bot_token'])) {
+            return false;
+        }
+        $token_parts = explode("*", $_REQUEST['bot_token']);
+        // reject tokens with no timestamp or with one that isn't a number
+        if (!isset($token_parts[1]) || !is_numeric($token_parts[1])) {
+            return false;
+        }
+        $sent_hash = $token_parts[0];
+        $timestamp = $token_parts[1];
+        if (abs(time() - $timestamp) >= self::TIME_WINDOW) {
+            return false;
+        }
+        $post = empty($_REQUEST["post"]) ? "" : $_REQUEST["post"];
+        $hash = hash("sha256", self::ACCESS_TOKEN . $timestamp . $post);
+        // hash_equals avoids timing attacks, it exists for php >= 5.6
+        if (function_exists('hash_equals')) {
+            return hash_equals($hash, $sent_hash);
+        }
+        return $hash === $sent_hash;
+    }
+    /**
+     * Get weather information about a location
+     *
+     * @param string $location the location to get weather updates for
+     * @return string weather information, or the empty string if the
+     *      response had no temperature or no condition text
+     */
+    public function getWeather($location)
+    {
+        $yql_query = "select * from weather.forecast where woeid in
+            (select woeid from geo.places(1) where text='" . $location
+            ."')";
+        $url = self::WEATHER_URL . "?q=" .
+            urlencode($yql_query) . "&format=json";
+        $ch = curl_init();
+        curl_setopt($ch, CURLOPT_URL, $url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+        $data = curl_exec($ch);
+        curl_close($ch);
+        $result = @json_decode($data);
+        $condition =
+            isset($result->query->results->channel->item->condition) ?
+            $result->query->results->channel->item->condition : null;
+        /* isset rather than empty is used so that a temperature of 0 is
+           still reported
+         */
+        $temp = isset($condition->temp) ? $condition->temp : "";
+        $text = isset($condition->text) ?
+            mb_strtolower($condition->text) : "";
+        if ($temp === "" || $text === "") {
+            return "";
+        }
+        return "The weather is $temp and $text in $location.";
+    }
+}
+// handle the current HTTP request as soon as this file is loaded
+$bot = new WeatherBot();
+$bot->processRequest();
+
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
--- a/src/executables/ClassifierTool.php
+++ b/src/executables/ClassifierTool.php
@ -0,0 +1,734 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\controllers\ClassifierController;
+use seekquarry\yioop\library\classifiers\Classifier;
+
+// this tool should only be run from the command-line
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/** Load in global configuration settings */
+require_once __DIR__.'/../configs/Config.php';
+if (!C\PROFILE) {
+    // trailing space keeps "visiting" and "its" from running together
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/**
+ * Error handler that immediately throws an exception for all notices and
+ * warnings, rather than letting execution continue. The message of any
+ * other kind of error is echoed. Errors that are not part of the current
+ * error_reporting() level are ignored.
+ *
+ * @param int $errno level of the error that was raised
+ * @param string $err_str the error message
+ * @param string $err_file name of the file the error was raised in
+ * @param int $err_line line number the error was raised at
+ * @throws \ErrorException for notices and warnings
+ * @ignore
+ */
+function handleError($errno, $err_str, $err_file, $err_line)
+{
+    if (!(error_reporting() & $errno)) {
+        /* Error isn't in the current error_reporting() mask, so ignore.
+           This covers errors suppressed by @: while @ is in effect
+           error_reporting() is 0 before PHP 8 and a mask containing
+           only fatal error levels from PHP 8 on.
+         */
+        return;
+    }
+    $msg = "$err_str in $err_file on line $err_line";
+    if ($errno == E_NOTICE || $errno == E_WARNING) {
+        // severity, file and line are passed on so the exception
+        // reports where the error happened rather than this handler
+        throw new \ErrorException($msg, $errno, $errno, $err_file,
+            $err_line);
+    } else {
+        echo $msg;
+    }
+}
+/* NOTE(review): handleError() above is declared in this file's namespace
+   (seekquarry\yioop\executables), whereas the name registered here is built
+   from C\NS_LIB -- confirm that it resolves to a defined function.
+ */
+set_error_handler(C\NS_LIB . 'classifiers\\handleError');
+
+/**
+ * Instructions for how to use classifier tool; printed by
+ * ClassifierTool::main() when the script is run without arguments
+ * @var string
+ */
+$INSTRUCTIONS = <<<EOD
+
+This tool is used to automate the building and testing of classifiers,
+providing an alternative to the web interface when a labeled training set is
+available.
+
+ClassifierTool.php takes an activity to perform, the name of a dataset to use,
+and a label for the constructed classifier. The activity is the name of one
+of the 'run*' functions implemented by this class, without the common 'run'
+prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
+of two indexes that have the suffixes "Pos" and "Neg", respectively.  So if
+the prefix were "DATASET", then this tool would look for the two existing
+indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
+negative examples. Each document in these indexes should be a positive or
+negative example of the target class, according to whether it's in the "Pos"
+or "Neg" index. Finally, the label is just the label to be used for the
+constructed classifier.
+
+Beyond these options (set with the -a, -d, and -l flags), a number of other
+options may be set to alter parameters used by an activity or a classifier.
+These options are set using the -S, -I, -F, and -B flags, which correspond
+to string, integer, float, and boolean parameters respectively. These flags
+may be used repeatedly, and each expects an argument of the form NAME=VALUE,
+where NAME is the name of a parameter, and VALUE is a value parsed according
+to the flag. The NAME should match one of the keys of the options member of
+this class, where a period ('.') may be used to specify nesting.  For
+example:
+
+    -I debug=1         # set the debug level to 1
+    -B cls.use_nb=1    # tell the classifier to use Naive Bayes
+
+To build and evaluate a classifier for the label 'spam', trained using the
+two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
+most informative features:
+
+php ClassifierTool.php -a TrainAndTest -d 'DATASET' -l 'spam'
+    -I cls.chi2.max=25
+
+The above assumes we are in the folder of ClassifierTool.php
+EOD;
+
+/*
+ * We'll set up multi-byte string handling to use UTF-8, both as the
+ * internal encoding and as the encoding of the multi-byte regex functions
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+/**
+ * Class used to encapsulate all the activities of the ClassifierTool.php
+ * command line script. This script allows one to automate the building and
+ * testing of classifiers, providing an alternative to the web interface when
+ * a labeled training set is available.
+ *
+ * @author Shawn Tice
+ */
+class ClassifierTool
+{
+    /**
+     * Reference to a classifier controller, used to manipulate crawl mixes in
+     * the same way that the controller that handles web requests does.
+     * @var object
+     */
+    protected $classifier_controller;
+
+    /**
+     * Reference to a crawl model object, also used to manipulate crawl mixes.
+     * @var object
+     */
+    protected $crawl_model;
+    /**
+     * Options to be used by activities and constructed classifiers. These
+     * options can be overridden by supplying an appropriate flag on the
+     * command line, where nesting is denoted by a period (e.g., cls.chi2.max).
+     * The supported options are:
+     *
+     *   debug: An integer, the level of debug statements to print. Larger
+     *       integers specify more detailed debug output; the default value of
+     *       0 indicates no debug output.
+     *
+     *   max_train: An integer, the maximum number of examples to use when
+     *       training a classifier. The default value of null indicates that
+     *       all available training examples should be used.
+     *
+     *   test_interval: An integer, the number of new training examples to be
+     *       added before a round of testing on ALL test instances is to be
+     *       executed. With an interval of 5, for example, after adding five
+     *       new training examples, the classifier would be finalized and used
+     *       to classify all test instances. The error is reported for each
+     *       round of testing. The default value of null indicates that
+     *       testing should only occur after all training examples have been
+     *       added.
+     *
+     *   split: An integer, the number of examples from the entire set of
+     *       labeled examples to use for training. The remainder are used for
+     *       testing.
+     *
+     *   cls.use_nb: A boolean, whether or not to use the Naive Bayes
+     *       classification algorithm instead of the logistic regression one
+     *       in order to finalize the classifier.  The default value is false,
+     *       indicating that logistic regression should be used.
+     *
+     *   cls.chi2.max: An integer, the maximum number of features to use when
+     *       training the classifier.  The default is a relatively
+     *       conservative 200.
+     *
+     * @var array
+     */
+    public $options = [
+        'debug' => 0,
+        'max_train' => null,
+        'test_interval' => null,
+        'split' => 3000,
+        'cls' => [
+            'use_nb' => false,
+            'chi2' => [
+                'max' => 200
+                ]
+            ]
+        ];
+    /**
+     * Sets up the classifier controller, and the crawl model obtained from
+     * it, that are used to manage crawl mixes when iterating over labeled
+     * examples.
+     */
+    public function __construct()
+    {
+        $controller = new ClassifierController();
+        $this->classifier_controller = $controller;
+        $this->crawl_model = $controller->model("crawl");
+    }
+    /**
+     * Reads the command-line flags. The three required ones (-a activity,
+     * -l label, -d dataset) are returned; if one of them is absent a message
+     * is printed and the program exits with status 1. Each remaining -S, -I,
+     * -F or -B flag is handed to the setOptions method, together with the
+     * name of the conversion function that goes with the flag (none for -S).
+     *
+     * @return array the parsed activity, dataset, and label
+     */
+    public function parseOptions()
+    {
+        $parsed = getopt('l:a:d:S:I:F:B:');
+        $required = [
+            'a' => "missing -a flag to choose activity to run\n",
+            'l' => "missing -l flag to set classifier label\n",
+            'd' => "missing -d flag to choose dataset to use\n"
+        ];
+        foreach ($required as $flag => $complaint) {
+            if (!isset($parsed[$flag])) {
+                echo $complaint;
+                exit(1);
+            }
+        }
+        $activity = $parsed['a'];
+        $label = Classifier::cleanLabel($parsed['l']);
+        $dataset_name = $parsed['d'];
+        unset($parsed['a'], $parsed['l'], $parsed['d']);
+        // conversion applied to the VALUE part of a typed parameter flag
+        $converters = [
+            'I' => 'intval',
+            'F' => 'floatval',
+            'B' => 'boolval'
+        ];
+        foreach ($parsed as $flag => $setting) {
+            if ($flag === 'S') {
+                $this->setOptions($setting);
+            } else if (isset($converters[$flag])) {
+                $this->setOptions($setting, $converters[$flag]);
+            } else {
+                echo "unsupported option: {$flag}\n";
+            }
+        }
+        return [$activity, $dataset_name, $label];
+    }
+
+    /**
+     * Entry point of the tool. Prints the instructions and exits if no
+     * command-line arguments were given. Otherwise, parses the options and
+     * calls the run method named by the activity with the label and dataset;
+     * if there is no such method, prints an error and exits.
+     */
+    public function main()
+    {
+        global $argv, $INSTRUCTIONS;
+        if (count($argv) < 2) {
+            echo $INSTRUCTIONS;
+            exit(1);
+        }
+        $parsed = $this->parseOptions();
+        $activity = $parsed[0];
+        $dataset_name = $parsed[1];
+        $label = $parsed[2];
+        $method = "run" . $activity;
+        if (!method_exists($this, $method)) {
+            echo "no activity: {$activity}\n\n";
+            exit(1);
+        }
+        $this->$method($label, $dataset_name);
+    }
+
+    /* ACTIVITIES */
+
+    /**
+     * Trains a classifier on a data set, testing at the specified intervals.
+     * The testing interval is set by the test_interval parameter. Each time
+     * this activity is run a new classifier is created (replacing an old one
+     * with the same label, if necessary), and the classifier remains at the
+     * end.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    public function runTrainAndTest($label, $dataset_name)
+    {
+        // presumably only takes effect if max_train wasn't given on the
+        // command line -- confirm against setDefault()
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $classifier->initBuffer($data['train'], 0);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        // don't try to train on more examples than the training set has
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            $page = $pages->nextPage();
+            // the known label of the page is what it gets labeled with
+            $doc_label = $page['TRUE_LABEL'];
+            $key = Classifier::makeKey($page);
+            $classifier->addBufferDoc($page, false);
+            $classifier->labelDocument($key, $doc_label, false);
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                /*
+                   Testing the classifier puts it into "classify" mode, which
+                   uses a different set of data from "label" mode, so it's
+                   important to switch back.
+                */
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+    /**
+     * Like the TrainAndTest activity, but uses active training in order to
+     * choose the documents to add to the training set. The method simulates
+     * the process that an actual user would go through in order to label
+     * documents for addition to the training set, then tests performance at
+     * the specified intervals.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    public function runActiveTrainAndTest($label, $dataset_name)
+    {
+        // presumably only takes effect if max_train wasn't given on the
+        // command line -- confirm against setDefault()
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        $classifier->initBuffer($pages);
+        // don't do more rounds than the training set has examples
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            // the classifier picks the document; $disagreement is not used
+            list($new_doc, $disagreement) =
+                $classifier->findNextDocumentToLabel();
+            if ($new_doc) {
+                $key = Classifier::makeKey($new_doc);
+                // the known label stands in for the answer of a user
+                $doc_label = $new_doc['TRUE_LABEL'];
+                $classifier->labelDocument($key, $doc_label);
+                $classifier->refreshBuffer($pages);
+                $classifier->computeBufferDensities();
+                $classifier->train();
+            }
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                // go back to labeling after the round of testing
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+
+    /* UTILITY METHODS */
+
+    /**
+     * Creates a new classifier for a label, first deleting any existing
+     * classifier with the same label.
+     *
+     * @param string $label class label of the new classifier
+     * @return object created classifier instance
+     */
+    public function makeFreshClassifier($label)
+    {
+        // remove any stale classifier stored under this label first
+        $existing = Classifier::getClassifier($label);
+        if ($existing) {
+            $this->deleteClassifier($label);
+        }
+        $fresh = new Classifier($label, $this->options['cls']);
+        Classifier::setClassifier($fresh);
+        return $fresh;
+    }
+
+    /**
+     * Deletes an existing classifier, specified by its label.
+     *
+     * @param string $label class label of the existing classifier
+     */
+    public function deleteClassifier($label)
+    {
+        Classifier::deleteClassifier($label);
+        // also remove the crawl mix (and its iterator state) for the label
+        $timestamp = $this->crawl_model->getCrawlMixTimestamp(
+            Classifier::getCrawlMixName($label));
+        if (!$timestamp) {
+            return;
+        }
+        $this->crawl_model->deleteCrawlMixIteratorState($timestamp);
+        $this->crawl_model->deleteCrawlMix($timestamp);
+    }
+    /**
+     * Fetches the summaries for pages in the indices specified by the passed
+     * dataset name. This method looks for existing indexes with names matching
+     * the dataset name prefix, and with suffix either "pos" or "neg" (ignoring
+     * case). The pages in these indexes are shuffled into one large array, and
+     * augmented with a TRUE_LABEL field that records which set they came from
+     * originally. The shuffled array is then split according to the `split'
+     * option, and all pages up to (but not including) the split index are used
+     * for the training set; the remaining pages are used for the test set.
+     *
+     * @param string $dataset_name prefix of index names to draw examples from
+     * @param string $class_label class label of the classifier the examples
+     * will be used to train (used to name the crawl mix that iterates over
+     * each index)
+     * @return array training and test datasets in an associative array with
+     * keys `train' and `test', where each dataset is wrapped up in a
+     * PageIterator that implements the CrawlMixIterator interface.
+     */
+    public function loadDataset($dataset_name, $class_label)
+    {
+        $crawl_list = $this->crawl_model->getCrawlList(false, true, null);
+        // matches descriptions such as "RECRAWL::<dataset> pos"
+        $pattern = '/^RECRAWL::'.preg_quote($dataset_name, '/').
+            ' (pos|neg)$/i';
+        $all_pages = [];
+        foreach ($crawl_list as $crawl) {
+            if (preg_match($pattern, $crawl['DESCRIPTION'], $matches)) {
+                // positive examples get label 1, negative examples -1
+                $true_label = (strtolower($matches[1]) == 'pos') ? 1 : -1;
+                $iterator =
+                    $this->classifier_controller->buildClassifierCrawlMix(
+                        $class_label, $crawl['CRAWL_TIME']);
+                while (!$iterator->end_of_iterator) {
+                    $batch = $iterator->nextPages(5000);
+                    /*
+                       A crawl mix iterator may add this bookkeeping field
+                       to its results; it is not a page, so drop it.
+                     */
+                    if (isset($batch['NO_PROCESS'])) {
+                        unset($batch['NO_PROCESS']);
+                    }
+                    foreach ($batch as $page) {
+                        $page['TRUE_LABEL'] = $true_label;
+                        $all_pages[] = $page;
+                    }
+                }
+            }
+        }
+        shuffle($all_pages);
+        $split = $this->options['split'];
+        if (count($all_pages) < $split) {
+            echo "split is larger than dataset\n";
+            exit(1);
+        }
+        // pages before the split index train; the remainder are for testing
+        return [
+            'train' => new PageIterator(array_slice($all_pages, 0, $split)),
+            'test' => new PageIterator(array_slice($all_pages, $split))
+        ];
+    }
+
+    /**
+     * Determines whether to run a classification test after a certain number
+     * of documents have been added to the training set. Whether or not to test
+     * is determined by the `test_interval' option, which may be either null,
+     * an integer, or a string. In the first case, testing only occurs after
+     * all training examples have been added; in the second case, testing
+     * occurs each time an additional constant number of training examples have
+     * been added; and in the final case, testing occurs on a fixed schedule of
+     * comma-separated offsets, such as "10,25,50,100".
+     *
+     * @param int $i the size of the current training set
+     * @param int $total the total number of documents available to be added to
+     * the training set
+     * @return bool true if the `test_interval' option specifies that a round
+     * of testing should occur for the current training offset, and false
+     * otherwise
+     */
+    public function isTestPoint($i, $total)
+    {
+        $interval = $this->options['test_interval'];
+        if (is_null($interval)) {
+            // no interval given: test only once all examples were added
+            return $i == $total;
+        }
+        if (is_int($interval)) {
+            if ($interval == 0) {
+                /*
+                   A zero interval would be a modulo by zero; treat it like
+                   an unset interval and test only at the end.
+                 */
+                return $i == $total;
+            }
+            return $i % $interval == 0;
+        }
+        /*
+           Otherwise the option is a comma-separated schedule of offsets;
+           whitespace around an offset is tolerated. preg_match returns an
+           int (or false on error), so compare to get the documented bool.
+         */
+        $re = '/(^|,)\s*'.preg_quote((string)$i, '/').'\s*(,|$)/';
+        return preg_match($re, $interval) === 1;
+    }
+    /**
+     * Finalizes the current classifier, uses it to classify all test
+     * documents, and logs the classification error.  The current classifier is
+     * saved to disk after finalizing (though not before), and left in
+     * `classify' mode. The iterator over the test dataset is reset for the
+     * next round of testing (if any).
+     *
+     * @param object $classifier classifier instance to test
+     * @param array $data the array of training and test datasets, constructed
+     * by loadDataset, of which only the `test' dataset is used.
+     */
+    public function testClassifier($classifier, $data)
+    {
+        $classifier->prepareToFinalize();
+        $classifier->finalize();
+        // save the finalized classifier before switching it to classify mode
+        Classifier::setClassifier($classifier);
+        $classifier->prepareToClassify();
+        $wrong = 0;
+        $total = 0;
+        $pages = $data['test'];
+        while (!$pages->end_of_iterator) {
+            $page = $pages->nextPage();
+            $score = $classifier->classify($page);
+            // scores of at least 0.5 count as a positive prediction
+            $page_label = $score >= 0.5 ? 1 : -1;
+            if ($page_label != $page['TRUE_LABEL']) {
+                $wrong++;
+            }
+            $total++;
+        }
+        if ($total > 0) {
+            $error = (float)$wrong / $total;
+            $this->log(0, 'error = %.4f', $error);
+        } else {
+            /*
+               The test set is empty when the split index equals the size
+               of the dataset; report that rather than dividing by zero.
+             */
+            $this->log(-2, 'test set is empty; cannot compute error');
+        }
+        // rewind so the next round of testing sees every test page again
+        $pages->reset();
+    }
+    /**
+     * Writes out logging information according to a detail level. The first
+     * argument is an integer (potentially negative) indicating the level of
+     * detail for the log message, where larger numbers indicate greater
+     * detail. Each message is prefixed with a character according to its level
+     * of detail, but if the detail level is greater than the level specified
+     * by the `debug' option then nothing is printed. The treatment for the
+     * available detail levels are as follows:
+     *
+     *    -2: Used for errors; always printed; prefix '! '
+     *    -1: Used for log of set options; always printed; prefix '# '
+     *    0+: Used for normal messages; prefix '> '
+     *
+     * The second argument is a printf-style string template specifying the
+     * message, and each following (optional) argument is used by the template.
+     * A newline is added automatically to each message.
+     *
+     * @param int $level level of detail for the message
+     * @param string $message printf-style template for the message
+     * @param string $args,... optional arguments to be used for the message
+     * template
+     */
+    public function log(/* varargs */)
+    {
+        $params = func_get_args();
+        // the first argument is the detail level; the rest go to printf
+        $detail = array_shift($params);
+        if ($detail > $this->options['debug']) {
+            return;
+        }
+        $marker = ($detail == -2) ? '! ' : (($detail == -1) ? '# ' : '> ');
+        echo $marker;
+        call_user_func_array('printf', $params);
+        echo "\n";
+    }
+    /**
+     * Logs the current options using the log method of this class, so that
+     * the settings used for a given run of an activity are stated
+     * explicitly. Nested option arrays are logged recursively with their
+     * keys joined by dots, and options whose value is null are skipped. The
+     * detail level passed to the log method is -1.
+     *
+     * @param array $root array of options to log; if null, all of the
+     *  options of this tool are logged
+     * @param string $prefix string put in front of each option name; used
+     *  in recursive calls to hold the dotted path of the enclosing keys
+     */
+    public function logOptions($root = null, $prefix = '')
+    {
+        $options = is_null($root) ? $this->options : $root;
+        foreach ($options as $name => $setting) {
+            if (is_array($setting)) {
+                $this->logOptions($setting, $prefix.$name.'.');
+                continue;
+            }
+            if (is_null($setting)) {
+                continue;
+            }
+            // spell booleans out, as strval would give "1" and ""
+            if ($setting === false) {
+                $setting = 'false';
+            } else if ($setting === true) {
+                $setting = 'true';
+            }
+            $this->log(-1, '%s%s = %s', $prefix, $name, strval($setting));
+        }
+    }
+    /**
+     * Sets one or more options of the form NAME=VALUE according to a converter
+     * such as intval, floatval, and so on. The options may be passed in either
+     * as a string (a single option) or as an array of strings, where each
+     * string corresponds to an option of the same type (e.g., int).
+     *
+     * @param string|array $opts single option in the format NAME=VALUE, or
+     * array of options, each for the same target type (e.g., int)
+     * @param string $converter the name of a function that takes a string and
+     * casts it to a particular type (e.g., intval, floatval)
+     */
+    public function setOptions($opts, $converter = null)
+    {
+        if (!is_array($opts)) {
+            $opts = [$opts];
+        }
+        foreach ($opts as $opt) {
+            $split = strpos($opt, '=');
+            if ($split === false) {
+                // without an '=' there is no name to look up
+                $this->log(-2,
+                    'malformed option (expected NAME=VALUE): "%s"', $opt);
+                continue;
+            }
+            $name = substr($opt, 0, $split);
+            $value = substr($opt, $split + 1);
+            if ($value === false) {
+                // older PHP versions return false for an empty remainder
+                $value = '';
+            }
+            if ($converter) {
+                if ($converter == 'boolval' && !function_exists('boolval')) {
+                    $value = (bool)$value;
+                } else {
+                    $value = call_user_func($converter, $value);
+                }
+            }
+            /*
+               Walk the dotted name down the options array. The value is
+               assigned only if every component names an existing key;
+               otherwise nothing is modified. (Assigning after a failed
+               walk would overwrite the parent array of the unknown key.)
+             */
+            $field =& $this->options;
+            $known = true;
+            foreach (explode('.', $name) as $part) {
+                if (!is_array($field) || !array_key_exists($part, $field)) {
+                    $known = false;
+                    break;
+                }
+                $field =& $field[$part];
+            }
+            if ($known) {
+                $field = $value;
+            }
+            // drop the reference so later code cannot write through it
+            unset($field);
+            if (!$known) {
+                $this->log(-2, 'unknown option: "%s"', $name);
+            }
+        }
+    }
+
+    /**
+     * Sets a default value for a runtime parameter. This method is used by
+     * activities to specify default values that may be overridden by passing
+     * the appropriate command-line flag.
+     *
+     * @param string $name should end with name of runtime parameter to set
+     * @param string $value what to set it to
+     */
+    public function setDefault($name, $value)
+    {
+        $path = explode('.', $name);
+        // the final component is the parameter; the rest locate its array
+        $leaf = array_pop($path);
+        $node =& $this->options;
+        foreach ($path as $segment) {
+            $node =& $node[$segment];
+        }
+        // only fill in the value if nothing (non-null) was set already
+        if (!isset($node[$leaf])) {
+            $node[$leaf] = $value;
+        }
+    }
+}
+/**
+ * This class provides the same interface as an iterator over crawl mixes, but
+ * simply iterates over an array.
+ *
+ * This is used to gather all of the pages for a training set in one go (using
+ * a crawl mix iterator), then repeatedly iterate over them in memory, as
+ * though they were coming from the original crawl mix iterator.
+ *
+ * @author Shawn Tice
+ */
+class PageIterator
+{
+    /**
+     * Pages wrapped by this iterator; they can be iterated over repeatedly.
+     * @var array
+     */
+    public $pages;
+
+    /**
+     * Number of pages in the wrapped array.
+     * @var int
+     */
+    public $length;
+
+    /**
+     * Offset of the next page to return from the wrapped array.
+     * @var int
+     */
+    public $pos;
+
+    /**
+     * True once the last page has been returned (or if there are no pages).
+     * @var bool
+     */
+    public $end_of_iterator;
+
+    /**
+     * Wraps an array of pages (which may be empty) and positions the
+     * iterator at its start.
+     *
+     * @param array $pages standard array of pages to iterate over
+     */
+    public function __construct($pages)
+    {
+        $this->pages = $pages;
+        $this->length = count($pages);
+        $this->reset();
+    }
+
+    /**
+     * Moves the iterator back to the first page.
+     */
+    public function reset()
+    {
+        $this->pos = 0;
+        // an empty iterator is at its end from the start
+        $this->end_of_iterator = ($this->length == 0);
+    }
+
+    /**
+     * Returns the next pages, at most the number requested, and advances
+     * the iterator past them. The result is an empty array if no pages are
+     * left. The `end_of_iterator' flag is set once the last page has been
+     * returned.
+     *
+     * @param int $n maximum number of pages to return, or -1 to return all
+     * remaining pages
+     * @return array next $n pages, or less if there are fewer than $n
+     * pages remaining
+     */
+    public function nextPages($n = -1)
+    {
+        $remaining = $this->length - $this->pos;
+        $count = ($n == -1) ? $remaining : min($remaining, $n);
+        $batch = array_slice($this->pages, $this->pos, $count);
+        $this->pos += $count;
+        if ($this->pos == $this->length) {
+            $this->end_of_iterator = true;
+        }
+        return $batch;
+    }
+    /**
+     * Like nextPages, but returns only the next page itself (not wrapped
+     * in an array), or null if there is none.
+     *
+     * @return array next page if available, and null otherwise
+     */
+    public function nextPage()
+    {
+        $batch = $this->nextPages(1);
+        if (empty($batch)) {
+            return null;
+        }
+        return $batch[0];
+    }
+}
+/*
+   Script entry point: construct the tool and run it. An ErrorException
+   that propagates out of main() is echoed rather than left uncaught.
+ */
+try {
+    $classifier_tool = new ClassifierTool();
+    $classifier_tool->main();
+} catch (\ErrorException $e) {
+    echo $e . "\n";
+}
--- a/src/executables/ClassifierTrainer.php
+++ b/src/executables/ClassifierTrainer.php
@ -0,0 +1,105 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlDaemon;
+use seekquarry\yioop\library\classifiers\Classifier;
+
+// This script may only be run from the command line
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/*
+   We must specify that we want logging enabled
+ */
+define("seekquarry\\yioop\\configs\\NO_LOGGING", false);
+/*
+   For crawlLog and Yioop Constants
+ */
+require_once __DIR__.'/../library/Utility.php';
+if (!C\PROFILE) {
+    // the trailing space keeps "visiting" and "its" from running together
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+    We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+/*
+   If possible, set the memory limit high enough to fit all of the features and
+   training documents into memory.
+ */
+ini_set("memory_limit", "500M");
+/**
+ * This class is used to finalize a classifier via the web interface.
+ *
+ * Because finalizing involves training a logistic regression classifier on a
+ * potentially-large set of training examples, it can take much longer than
+ * would be allowed by the normal web execution time limit. So instead of
+ * trying to finalize a classifier directly in the controller that handles the
+ * web request, the controller kicks off a daemon that simply loads the
+ * classifier, finalizes it, and saves it back to disk.
+ *
+ * The classifier to finalize is specified by its class label, passed as the
+ * second command-line argument. The following command would be used to run
+ * this script directly from the command-line:
+ *
+ *    $ php src/executables/ClassifierTrainer.php terminal LABEL
+ *
+ * @author Shawn Tice
+ */
+class ClassifierTrainer
+{
+    /**
+     * This is the function that should be called to get the
+     * ClassifierTrainer to start training a logistic regression instance for
+     * a particular classifier. The class label corresponding to the
+     * classifier to be finalized should be passed as the second command-line
+     * argument. If no classifier can be loaded for that label, this is
+     * logged and the daemon is stopped without training.
+     */
+    public function start()
+    {
+        global $argv;
+        CrawlDaemon::init($argv, "ClassifierTrainer");
+        $label = $argv[2];
+        L\crawlLog("Initializing classifier trainer log..",
+            $label.'-ClassifierTrainer', true);
+        $classifier = Classifier::getClassifier($label);
+        if ($classifier) {
+            $classifier->prepareToFinalize();
+            $classifier->finalize();
+            // write the finalized classifier back out
+            Classifier::setClassifier($classifier);
+            L\crawlLog("Training complete.\n");
+        } else {
+            // nothing was loaded, so there is no object to finalize
+            L\crawlLog("No classifier found for label: " . $label . "\n");
+        }
+        CrawlDaemon::stop('ClassifierTrainer', $label);
+    }
+}
+// Script entry point: create the trainer and start it
+$classifier_trainer = new ClassifierTrainer();
+$classifier_trainer->start();
--- a/src/executables/CodeTool.php
+++ b/src/executables/CodeTool.php
@ -0,0 +1,461 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Tool used to help coding with Yioop. Has commands to update copyright info,
+ * clean trailing spaces, find long lines, and do global file searches and
+ * replaces.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\models\Model;
+use seekquarry\yioop\library\Utility;
+
+// This tool may only be run from the command line
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/** Load in global configuration settings */
+require_once __DIR__ . '/../configs/Config.php';
+if (!C\PROFILE) {
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+// becomes true when a command ran, which suppresses the usage text below
+$no_instructions = false;
+$model = new Model();
+// read by mapPath() (via global) to traverse directories
+$db = $model->db;
+// command names accepted as the first command-line argument
+$commands = ["copyright", "clean", "longlines", "search", "replace"];
+// only files with these extensions are examined by the file callbacks
+$change_extensions = ["php", "js", "ini", "css", "thtml", "xml"];
+// paths containing any of these substrings are skipped by excludedPath()
+$exclude_paths_containing = ["/.", "/extensions/"];
+// number of spaces cleanLinesFile() substitutes for each tab
+$num_spaces_tab = 4;
+/*
+ * Dispatch: each command is a function of the same name in this namespace.
+ * The script name and the command name are shifted off $argv so the
+ * function receives only its own arguments.
+ */
+if (isset($argv[1]) && in_array($argv[1], $commands)) {
+    $command = C\NS_EXEC . $argv[1];
+    array_shift($argv);
+    array_shift($argv);
+    $no_instructions = $command($argv);
+}
+// No command ran (or it asked for help): print the usage text
+if (!$no_instructions) {
+    echo <<< EOD
+CodeTool.php has the following command formats:
+
+php CodeTool.php clean path
+    Replaces all tabs with four spaces and trims all whitespace off ends of
+    lines in the folder or file path. Removes trailing ?> from files
+    Adds a space between if, for, foreach, etc and ( if not present
+
+php CodeTool.php copyright path
+    Adjusts all lines in the files in the folder at path (or if
+    path is a file just that) of the form 2009 - \d\d\d\d to
+    the form 2009 - this_year where this_year is the current year.
+
+php CodeTool.php longlines path
+    Prints out all lines in files in the folder or file path which are
+    longer than 80 characters.
+
+php CodeTool.php replace path pattern replace_string
+  or
+php CodeTool.php replace path pattern replace_string effect
+    Prints all lines matching the regular expression pattern followed
+    by the result of replacing pattern with replace_string in the
+    folder or file path. Does not change files.
+
+php CodeTool.php replace path pattern replace_string interactive
+    Prints each line matching the regular expression pattern followed
+    by the result of replacing pattern with replace_string in the
+    folder or file path. Then it asks if you want to update the line.
+    Lines you choose for updating will be modified in the files.
+
+php CodeTool.php replace path pattern replace_string change
+    Each line matching the regular expression pattern is updated
+    by replacing pattern with replace_string in the
+    folder or file path. This format does not echo anything, it does a global
+    replace without interaction.
+
+php CodeTool.php search path pattern
+    Prints all lines matching the regular expression pattern in the
+    folder or file path.
+
+EOD;
+}
+/**
+ * Used to clean trailing whitespace from files in a folder or just from
+ * a file given in the command line. It also removes final ?> characters
+ * to make php files conform with suggested coding guidelines. Similarly,
+ * adds a space between if, for, foreach, etc and ( if not present to make
+ * match PHP coding guidelines
+ *
+ * @param array $args $args[0] contains path to sub-folder/file
+ * @return bool $no_instructions false if should output CodeTool.php
+ *     instructions
+ */
+function clean($args)
+{
+    if (!isset($args[0])) {
+        // no path given, so the caller should print the instructions
+        return false;
+    }
+    // the per-file work, including tab expansion, is in cleanLinesFile
+    mapPath(realpath($args[0]), C\NS_EXEC . "cleanLinesFile");
+    return true;
+}
+/**
+ * Updates the copyright info (assuming in Yioop docs format) on files
+ * in supplied sub-folder/file. That is, it changes strings matching
+ * /2009 - \d\d\d\d/ to 2009 - current_year in those files/file.
+ *
+ * @param array $args $args[0] contains path to sub-folder/file
+ * @return bool $no_instructions false if should output CodeTool.php
+ *     instructions
+ */
+function copyright($args)
+{
+    if (!isset($args[0])) {
+        return false;
+    }
+    $target = realpath($args[0]);
+    $replacement = "2009 - " . date("Y");
+    /*
+       Calling replaceFile with an empty filename only stores the pattern,
+       replacement and mode for the per-file calls made by mapPath
+     */
+    replaceFile("", "/2009 \- \d\d\d\d/", $replacement, "change");
+    mapPath($target, C\NS_EXEC . "replaceFile");
+    return true;
+}
+/**
+ * Searches for and echoes line numbers and lines for lines of length greater
+ * than 80 characters in files in supplied sub-folder/file.
+ *
+ * @param array $args $args[0] contains path to sub-folder/file
+ * @return bool $no_instructions false if should output CodeTool.php
+ *     instructions
+ */
+function longlines($args)
+{
+    global $change_extensions;
+    // ini and xml files are left out of the long line check
+    $change_extensions = array_diff($change_extensions, ["ini", "xml"]);
+    if (!isset($args[0])) {
+        return false;
+    }
+    $target = realpath($args[0]);
+    // an empty filename only stores the pattern for the later per-file calls
+    searchFile("", "/([^\n]){81}/u");
+    mapPath($target, C\NS_EXEC . "searchFile");
+    return true;
+}
+/**
+ * Performs a search and replace for given pattern in files in supplied
+ * sub-folder/file
+ *
+ * @param array $args $args[0] contains path to sub-folder/file,
+ *     $args[1] contains the regex searching for, $args[2] contains
+ *     what it should be replaced with, $args[3] (defaults to effect)
+ *     controls the mode of operation. One of "effect", "change", or
+ *     "interactive". effect shows line number and lines matching pattern,
+ *     but commits no changes; interactive for each match, prompts user
+ *     if should do the change, change does a global search and replace
+ *     without output
+ * @return bool $no_instructions false if should output CodeTool.php
+ *     instructions
+ */
+function replace($args)
+{
+    if (!isset($args[0]) || !isset($args[1]) || !isset($args[2])) {
+        return false;
+    }
+    $target = realpath($args[0]);
+    $search = $args[1];
+    $substitute = $args[2];
+    $mode = isset($args[3]) ? $args[3] : "effect";
+    // patterns shorter than two characters are ignored
+    if (strlen($search) >= 2) {
+        /*
+           NOTE(review): preg_quote escapes regex metacharacters, so the
+           pattern is matched literally although the usage text calls it a
+           regular expression -- confirm which is intended
+         */
+        $regex = "@" . preg_quote($search, "@") . "@";
+        // an empty filename only stores the settings for the per-file calls
+        replaceFile("", $regex, $substitute, $mode);
+        mapPath($target, C\NS_EXEC . "replaceFile");
+    }
+    return true;
+}
+/**
+ * Performs a search for given pattern in files in supplied sub-folder/file
+ *
+ * @param array $args $args[0] contains path to sub-folder/file,
+ *     $args[1] contains the regex searching for
+ * @return bool $no_instructions false if should output CodeTool.php
+ *     instructions
+ */
+function search($args)
+{
+    if (!isset($args[0]) || !isset($args[1])) {
+        return false;
+    }
+    $target = realpath($args[0]);
+    $needle = $args[1];
+    // patterns shorter than two characters are ignored
+    if (strlen($needle) >= 2) {
+        /*
+           NOTE(review): preg_quote escapes regex metacharacters, so the
+           pattern is matched literally although the usage text calls it a
+           regular expression -- confirm which is intended
+         */
+        $regex = "@" . preg_quote($needle, "@") . "@";
+        // an empty filename only stores the pattern for the per-file calls
+        searchFile("", $regex);
+        mapPath($target, C\NS_EXEC . "searchFile");
+    }
+    return true;
+}
+/**
+ * Callback function that can be applied to each file in a directory being
+ * traversed in order to update copyright years. It checks if the file has
+ * the extension of a code file and is not in an excluded path, and if so
+ * rewrites strings of the form 2009 - \d\d\d\d to 2009 - year. The file is
+ * only written back if some line changed.
+ *
+ * @param string $filename name of file to check for copyright lines and update
+ * @param mixed $set_year if false then the year from the most recent call
+ *  that set one is used (the current year if none did), otherwise, the
+ *  end of the copyright period is set to this value for this and later calls
+ */
+function changeCopyrightFile($filename, $set_year = false)
+{
+    global $change_extensions;
+    static $year = null;
+    if ($set_year) {
+        $year = $set_year;
+    } else if (is_null($year)) {
+        $year = date("Y");
+    }
+    $path_parts = pathinfo($filename);
+    if (!isset($path_parts['extension'])) {
+        // files without an extension are never code files
+        return;
+    }
+    $extension = $path_parts['extension'];
+    if (excludedPath($filename) ||
+        !in_array($extension, $change_extensions)) {
+        return;
+    }
+    $lines = file($filename);
+    if ($lines === false) {
+        return;
+    }
+    $out_year = "2009 - " . $year;
+    $out_file = "";
+    $change = false;
+    foreach ($lines as $line) {
+        $new_line = preg_replace("/2009 \- \d\d\d\d/", $out_year, $line);
+        if (strcmp($new_line, $line) != 0) {
+            $change = true;
+        }
+        // lines from file() keep their newline, so plain concatenation
+        // reproduces the file layout
+        $out_file .= $new_line;
+    }
+    if ($change) {
+        file_put_contents($filename, $out_file);
+    }
+}
+/**
+ * Callback function applied to each file in the directory being traversed
+ * by @see clean().
+ *
+ * @param string $filename name of file to clean lines for
+ */
+function cleanLinesFile($filename)
+{
+    global $change_extensions;
+    global $num_spaces_tab;
+    $path_parts = pathinfo($filename);
+    if (!isset($path_parts['extension'])) {
+        // files without an extension are never code files
+        return;
+    }
+    $extension = $path_parts['extension'];
+    if (excludedPath($filename) ||
+        !in_array($extension, $change_extensions)) {
+        return;
+    }
+    $lines = file($filename);
+    if (empty($lines)) {
+        // unreadable or empty file: nothing to clean
+        return;
+    }
+    $spaces = str_repeat(" ", $num_spaces_tab);
+    /*
+       The word boundary keeps identifiers that merely end in a keyword
+       (for example lowercase( ) from being split
+     */
+    $keyword_pattern = '/\b(if|elseif|else|switch|case|' .
+        'while|foreach|for|catch)\(/';
+    $out_lines = [];
+    foreach ($lines as $line) {
+        $new_line = str_replace("\t", $spaces, $line);
+        $new_line = preg_replace($keyword_pattern, "$1 (", $new_line);
+        // rtrim also removes the line ending; implode puts it back below
+        $out_lines[] = rtrim($new_line);
+    }
+    // drop a closing tag that is the last line of the file
+    if (end($out_lines) == '?>') {
+        array_pop($out_lines);
+    }
+    // every line, including the last, ends in exactly one newline
+    $out_file = implode("\n", $out_lines) . "\n";
+    if ($out_file !== implode("", $lines)) {
+        file_put_contents($filename, $out_file);
+    }
+}
+/**
+ * Callback function applied to each file in the directory being traversed
+ * by @see search(). Searches $filename matching $pattern and outputs line
+ *     numbers and lines
+ *
+ * @param string $filename name of file to search in
+ * @param mixed $set_pattern if not false, then sets $set_pattern in $pattern to
+ *     initialize the callback on subsequent calls. $pattern here is the
+ *     search pattern
+ */
+function searchFile($filename, $set_pattern = false)
+{
+    global $change_extensions;
+    // remembered between calls so the traversal callback needs no pattern
+    static $pattern = "/";
+    if ($set_pattern) {
+        $pattern = $set_pattern;
+    }
+    $info = pathinfo($filename);
+    if (!isset($info['extension'])) {
+        return;
+    }
+    if (excludedPath($filename) ||
+        !in_array($info['extension'], $change_extensions)) {
+        return;
+    }
+    $header_printed = false;
+    foreach (file($filename) as $index => $line) {
+        if (!preg_match($pattern, $line)) {
+            continue;
+        }
+        // the file name is printed once, before its first matching line
+        if (!$header_printed) {
+            $header_printed = true;
+            echo "\nIn $filename:\n";
+        }
+        $num = $index + 1;
+        echo "  Line $num: $line";
+    }
+}
+/**
+ * Callback function applied to each file in the directory being traversed
+ * by @see replace(). Searches $filename matching $pattern. Depending
+ *     on $mode ($args[3] as described in replace()), it outputs and
+ *     replaces with $replace
+ *
+ * @param string $filename name of file to search and replace in
+ * @param mixed $set_pattern if not false, then sets $set_pattern in $pattern to
+ *     initialize the callback on subsequent calls. $pattern here is the
+ *     search pattern
+ * @param mixed $set_replace if not false, then sets $set_replace in $replace to
+ *     initialize the callback on subsequent calls.
+ * @param mixed $set_mode if not false, then sets $set_mode in $mode to
+ *     initialize the callback on subsequent calls.
+ */
+function replaceFile($filename, $set_pattern = false,
+    $set_replace = false, $set_mode = false)
+{
+    global $change_extensions;
+    // remembered between calls so the traversal callback needs no settings
+    static $pattern = "/";
+    static $replace = "";
+    static $mode = "effect";
+
+    if ($set_pattern) {
+        $pattern = $set_pattern;
+    }
+    /*
+       Replacement strings such as "0" or "" are falsy but still legitimate
+       values, so any string is accepted here, not only truthy ones
+     */
+    if (is_string($set_replace) || $set_replace) {
+        $replace = $set_replace;
+    }
+    if ($set_mode) {
+        $mode = $set_mode;
+    }
+
+    $path_parts = pathinfo($filename);
+    if (!isset($path_parts['extension'])) {
+        // this is also how the initializing call with "" as filename ends
+        return;
+    }
+    $extension = $path_parts['extension'];
+    if (!excludedPath($filename) && in_array($extension, $change_extensions)) {
+        $lines = file($filename);
+        $out_lines = "";
+        $no_output = true;
+        // "change" mode rewrites files without echoing anything
+        $silent = ($mode == "change");
+        $num = 0;
+        $change = false;
+        foreach ($lines as $line) {
+            $num++;
+            $new_line = $line;
+            if (preg_match($pattern, $line)) {
+                if ($no_output && !$silent) {
+                    $no_output = false;
+                    echo "\nIn $filename:\n";
+                }
+                $new_line = preg_replace($pattern, $replace, $line);
+                if (!$silent) {
+                    echo "  Line $num: $line";
+                    echo "  Changes to: $new_line";
+                }
+                if ($mode == "interactive") {
+                    echo "Do replacement? (Yy - yes, anything else no): ";
+                    /*
+                       NOTE(review): readInput is not defined or imported
+                       in the visible part of this file -- confirm it
+                       resolves from this namespace
+                     */
+                    $confirm = strtolower(readInput());
+                    if ($confirm != "y") {
+                        $new_line = $line;
+                    }
+                }
+                if (strcmp($new_line, $line) != 0) {
+                    $change = true;
+                }
+            }
+            // lines from file() keep their newline, so concatenating
+            // them reproduces the file layout
+            $out_lines .= $new_line;
+        }
+        // "effect" mode only reports; the other modes write changes back
+        if (in_array($mode, ["change", "interactive"])) {
+            if ($change) {
+                file_put_contents($filename, $out_lines);
+            }
+        }
+    }
+}
+/**
+ * Applies the function $callback to each file in $path
+ *
+ * @param string $path to apply map $callback to
+ * @param string $callback function name to call with filename of each file
+ *     in path
+ */
+function mapPath($path, $callback)
+{
+    global $db;
+    if (!is_dir($path)) {
+        // a single file (or anything that is not a folder) is handled directly
+        $callback($path);
+        return;
+    }
+    $db->traverseDirectory($path, $callback, true);
+}
+/**
+ * Checks if $path is amongst a list of paths which should be ignored
+ *
+ * @param $path a directory path
+ * @return bool whether or not it should be ignored (true == ignore)
+ */
+function excludedPath($path)
+{
+    global $exclude_paths_containing;
+
+    foreach ($exclude_paths_containing as $exclude) {
+        if ($exclude === '') {
+            // an empty needle cannot meaningfully be searched for
+            continue;
+        }
+        /*
+           Compare against false explicitly: a match at offset 0 and a
+           matched remainder of "0" are both falsy yet are still matches
+         */
+        if (strpos($path, $exclude) !== false) {
+            return true;
+        }
+    }
+    return false;
+}
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
--- a/src/executables/MediaUpdater.php
+++ b/src/executables/MediaUpdater.php
@ -0,0 +1,215 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\CrawlDaemon;
+use seekquarry\yioop\library\FetchUrl;
+use seekquarry\yioop\library\MediaConstants;
+use seekquarry\yioop\library\media_jobs\MediaJob;
+use seekquarry\yioop\library\WikiParser;
+
+/* This script is only meant to be run from the command line */
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+ini_set("memory_limit", "1300M");
+/** We do want logging, but crawl model and other will try to turn off
+ * if we don't set this
+ */
+define("seekquarry\\yioop\\configs\\NO_LOGGING", false);
+/** To guess language based on page encoding */
+require_once __DIR__."/../library/LocaleFunctions.php";
+if (!C\PROFILE) {
+    // trailing space keeps "visiting" and "its" apart once concatenated
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+/**
+ * Separate process/command-line script which can be used to update
+ * news sources for Yioop and also handle other kinds of activities such as
+ * video conversion. This is an alternative to using the web app
+ * for updating. Makes use of the web-apps code.
+ *
+ * @author Chris Pollett
+ */
+class MediaUpdater implements CrawlConstants
+{
+    /**
+     * Shortest time through one iteration of news updater's loop
+     */
+    const MINIMUM_UPDATE_LOOP_TIME = 10;
+    /**
+     * Set to 0 by the constructor and not read inside this class
+     * (presumably bookkeeping consulted by media jobs -- TODO confirm)
+     * @var int
+     */
+    public $delete_time;
+    /**
+     * Set to 0 by the constructor and not read inside this class
+     * (presumably bookkeeping consulted by media jobs -- TODO confirm)
+     * @var int
+     */
+    public $retry_time;
+    /**
+     * The last time feeds were checked for updates
+     * @var int
+     */
+    public $update_time;
+    /**
+     * If true then it is assumed that mail should be
+     * sent using a media updater rather than from within the web app
+     *
+     * @var bool
+     */
+    public $mail_mode;
+    /**
+     * Controls whether media updating should be viewed as only occurring
+     * on the name server or should it be viewed as a distributed process
+     * amongst all machines in this Yioop instance
+     * @var string
+     */
+    public $media_mode;
+    /**
+     * List of jobs this media updater performs
+     * @var array
+     */
+    public $jobs;
+    /**
+     * Sets up the field variables so that media updating can begin
+     */
+    public function __construct()
+    {
+        $this->delete_time = 0;
+        $this->retry_time = 0;
+        $this->update_time = 0;
+        $this->media_mode = "name_server";
+        /* previously this line re-assigned media_mode, which wiped out the
+           "name_server" default and left mail_mode uninitialized */
+        $this->mail_mode = false;
+        $this->jobs = [];
+    }
+    /**
+     * This is the function that should be called to get the MediaUpdater to
+     * start updating. Calls init to handle the command-line
+     * arguments, builds the list of jobs to run, then enters the
+     * updater's main loop.
+     *
+     * With no job names on the command line every *Job.php file in the
+     * base and app media_jobs folders is loaded, an app file taking
+     * precedence over a base file of the same name. Otherwise only the
+     * jobs named in $argv[2], $argv[3], ... are loaded.
+     */
+    public function start()
+    {
+        global $argv;
+        CrawlDaemon::init($argv, "MediaUpdater");
+        L\crawlLog("\n\nInitialize logger..", "MediaUpdater", true);
+        L\crawlLog("Acquiring list of jobs...");
+        $job_path = C\BASE_DIR ."/library/media_jobs/";
+        $app_job_path = C\APP_DIR ."/library/media_jobs/";
+        if (empty($argv[2])) {
+            // glob() returns false on error; treat that as "no files"
+            $base_job_files = glob("$job_path*Job.php") ?: [];
+            $job_files = glob("$app_job_path*Job.php") ?: [];
+            foreach ($base_job_files as $job_file) {
+                $app_equiv_job = $app_job_path . basename($job_file);
+                if (!in_array($app_equiv_job, $job_files)) {
+                    $job_files[] = $job_file;
+                }
+            }
+        } else {
+            $job_files = [];
+            $pre_jobs = array_slice($argv, 2);
+            foreach ($pre_jobs as $pre_job) {
+                if (file_exists($app_job_path . "{$pre_job}Job.php")) {
+                    $job_files[] = $app_job_path . "{$pre_job}Job.php";
+                } else if (file_exists($job_path . "{$pre_job}Job.php")) {
+                    $job_files[] = $job_path . "{$pre_job}Job.php";
+                } else {
+                    L\crawlLog("... no job file found for $pre_job");
+                }
+            }
+        }
+        foreach ($job_files as $job_file) {
+            require_once $job_file;
+            /* basename() rather than an offset computed from the base job
+               path: files found under the app job path have a different
+               prefix length, so the offset produced a wrong class name */
+            $job_name = C\NS_JOBS . basename($job_file, ".php");
+            if ($job_name != C\NS_JOBS . "MediaJob") {
+                $job = new $job_name($this);
+                $this->jobs[] = $job;
+                L\crawlLog("... loading $job_name");
+            }
+        }
+        $this->loop();
+    }
+    /**
+     * Main loop for the news updater. Each pass refreshes the updater
+     * properties, runs every loaded job once, and then sleeps so that a
+     * pass takes at least MINIMUM_UPDATE_LOOP_TIME seconds. The loop ends
+     * when CrawlDaemon::processHandler() returns a falsy value.
+     */
+    public function loop()
+    {
+        L\crawlLog("In Media Update Loop");
+        L\crawlLog("PHP Version in use:  " . phpversion());
+        while (CrawlDaemon::processHandler()) {
+            $start_time = microtime(true);
+            $this->getUpdateProperties();
+            if (!empty($this->jobs)) {
+                foreach ($this->jobs as $job) {
+                    $job->run();
+                }
+            }
+            $sleep_time = max(0, ceil(self::MINIMUM_UPDATE_LOOP_TIME -
+                    L\changeInMicrotime($start_time)));
+            if ($sleep_time > 0) {
+                L\crawlLog("Ensure minimum loop time by sleeping...".
+                    $sleep_time);
+                sleep($sleep_time);
+            }
+        } //end while
+        L\crawlLog("Media Updater shutting down!!");
+    }
+    /**
+     * Makes a request to the name server to find out if we are running
+     * as a media updater just on the name server or on both the name server
+     * as well as all other machines in the Yioop instance. Also picks up
+     * whether mail should be sent by the media updater. Fields are left
+     * unchanged when the name server returns nothing or omits a property.
+     */
+    public function getUpdateProperties()
+    {
+        L\crawlLog("Checking Name Server for Media Updater properties...");
+        /* NOTE(review): result is not used here; call kept in case
+           getCurrentMachine() has side effects -- confirm and remove */
+        $current_machine = MediaJob::getCurrentMachine();
+        $properties = MediaJob::execNameServer(
+            "getUpdateProperties");
+        if ($properties) {
+            if (isset($properties['MEDIA_MODE'])) {
+                $this->media_mode = $properties['MEDIA_MODE'];
+                L\crawlLog("...Setting media mode to: " .
+                    $properties['MEDIA_MODE']);
+            }
+            if (isset($properties['SEND_MAIL_MEDIA_UPDATER'])) {
+                $this->mail_mode =
+                    ($properties['SEND_MAIL_MEDIA_UPDATER'] == "true");
+                L\crawlLog("...Setting mail mode to: " .
+                    (($this->mail_mode) ? "true" : "false"));
+            }
+        }
+        L\crawlLog("Done checking Name Server for Media Updater properties");
+    }
+}
+/*
+ * Instantiate and run the MediaUpdater program. start() finishes by
+ * entering the updater's main loop, so the call below only returns once
+ * that loop has exited.
+ */
+$media_updater =  new MediaUpdater();
+$media_updater->start();
+
--- a/src/executables/Mirror.php
+++ b/src/executables/Mirror.php
@ -0,0 +1,338 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\CrawlDaemon;
+use seekquarry\yioop\library\FetchUrl;
+
+/* This script is only meant to be run from the command line */
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+ini_set("memory_limit","850M"); //so have enough memory to crawl big pages
+
+/** CRAWLING means don't try to use memcache
+ * @ignore
+ */
+define("seekquarry\\yioop\\configs\\NO_CACHE", true);
+/** for crawlHash and crawlLog and Yioop constants */
+require_once __DIR__."/../library/Utility.php";
+if (!C\PROFILE) {
+    // trailing space keeps "visiting" and "its" apart once concatenated
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+/**
+ * This class is responsible for syncing crawl archives between machines using
+ * the SeekQuarry/Yioop search engine
+ *
+ * Mirror periodically queries the queue server asking for a list of files that
+ * have changed in its parent since the last sync time. It then proceeds to
+ * download them.
+ *
+ * @author Chris Pollett
+ */
+class Mirror implements CrawlConstants
+{
+    /**
+     * Reference to a database object. Used since has directory manipulation
+     * functions
+     * @var object
+     */
+    public $db;
+    /**
+     * Url or IP address of the name_server to get sites to crawl from
+     * @var string
+     */
+    public $name_server;
+    /**
+     * Last time a sync list was obtained from master machines
+     * (a Unix timestamp, 0 if no sync has been recorded yet)
+     * @var int
+     */
+    public $last_sync;
+    /**
+     * Last time the machine being mirrored was notified Mirror.php is still
+     * running (a Unix timestamp)
+     * @var int
+     */
+    public $last_notify;
+    /**
+     * File name where last sync time is written
+     * @var string
+     */
+    public $last_sync_file;
+    /**
+     * Time of start of current sync (a Unix timestamp)
+     * @var int
+     */
+    public $start_sync;
+    /**
+     * Files to download for current sync. Each entry is an array read
+     * with the keys 'name', 'is_dir' and, for files, 'size'
+     * @var array
+     */
+    public $sync_schedule;
+    /**
+     * Directory to sync
+     * @var string
+     */
+    public $sync_dir;
+    /**
+     * Url of the Yioop instance we are mirroring
+     * @var string
+     */
+    public $parent_url;
+    /**
+     * Maximum number of bytes from a file to download in one go
+     */
+    const DOWNLOAD_RANGE = 50000000;
+    /**
+     * Number of consecutive failed attempts at one segment of a large file
+     * after which the download is abandoned and the file re-queued
+     */
+    const MAX_SEGMENT_RETRIES = 5;
+    /**
+     * Sets up the field variables so that syncing can begin
+     *
+     * @param string $name_server URL or IP address of the name server
+     */
+    public function __construct($name_server)
+    {
+        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
+        $this->db = new $db_class();
+        $this->name_server = $name_server;
+        $this->last_sync_file = C\CRAWL_DIR."/schedules/last_sync.txt";
+        if (file_exists($this->last_sync_file)) {
+            $this->last_sync = unserialize(
+                file_get_contents($this->last_sync_file));
+        } else {
+            $this->last_sync = 0;
+        }
+        $this->start_sync = $this->last_sync;
+        $this->last_notify = $this->last_sync;
+        $this->sync_schedule = [];
+        $this->sync_dir = C\CRAWL_DIR."/cache";
+        $this->parent_url = $name_server;
+    }
+    /**
+     * This is the function that should be called to get the mirror to start
+     * syncing. Calls init to handle the command line arguments then enters
+     * the syncer's main loop
+     */
+    public function start()
+    {
+        global $argv;
+        CrawlDaemon::init($argv, "Mirror");
+        L\crawlLog("\n\nInitialize logger..", "mirror", true);
+        $this->loop();
+    }
+    /**
+     * Main loop for the mirror script. Each pass consumes any pending
+     * MirrorMessages.txt file, works out which server to mirror, asks
+     * checkScheduler() for the current status, and, unless there is
+     * nothing to do, copies the next scheduled file.
+     */
+    public function loop()
+    {
+        L\crawlLog("In Sync Loop");
+        L\crawlLog("PHP Version in use: " . phpversion());
+        while (CrawlDaemon::processHandler()) {
+            $syncer_message_file = C\CRAWL_DIR .
+                "/schedules/MirrorMessages.txt";
+            if (file_exists($syncer_message_file)) {
+                $info = unserialize(file_get_contents($syncer_message_file));
+                unlink($syncer_message_file);
+                if (isset($info[self::STATUS]) &&
+                    $info[self::STATUS] == self::STOP_STATE) {
+                    continue;
+                }
+            }
+            $parent_file = C\CRAWL_DIR . "/schedules/mirror_parent.txt";
+            if (file_exists($parent_file)) {
+                /* trim so a trailing newline left by an editor does not
+                   end up inside the request urls built from this value */
+                $this->parent_url = trim(file_get_contents($parent_file));
+                L\crawlLog("Read File: " . $parent_file . ".");
+                L\crawlLog("Set parent server to: " . $this->parent_url);
+            } else {
+                L\crawlLog("File: " . $parent_file . " does not exist!");
+                L\crawlLog("Assuming parent is name server: ".
+                    $this->name_server);
+                $this->parent_url = $this->name_server;
+            }
+            $info = $this->checkScheduler();
+            if ($info === false) {
+                L\crawlLog("Cannot connect to parent server...".
+                    " will try again in ".
+                    C\MIRROR_NOTIFY_FREQUENCY." seconds.");
+                sleep(C\MIRROR_NOTIFY_FREQUENCY);
+                continue;
+            }
+            if (isset($info[self::STATUS]) &&
+                $info[self::STATUS] == self::NO_DATA_STATE) {
+                L\crawlLog("No data from parent server. Sleeping...");
+                sleep(C\MIRROR_NOTIFY_FREQUENCY);
+                continue;
+            }
+            $this->copyNextSyncFile();
+        } //end while
+        L\crawlLog("Mirror shutting down!!");
+    }
+    /**
+     * Gets status and, if done processing all other mirroring activities,
+     * gets a new list of files that have changed since the last synchronization
+     * from the web app of the machine we are mirroring with.
+     *
+     * @return mixed array or bool. Returns false if weren't successful in
+     *     contacting web app, otherwise, returns an array with a status
+     *     and potentially a list of files to sync
+     */
+    public function checkScheduler()
+    {
+        $info = [];
+        $server = $this->parent_url;
+        $start_time = microtime(true);
+        $time = time();
+        $session = md5($time . C\AUTH_KEY);
+        $write_sync_time = true;
+        $request =
+            $server.
+            "?c=resource&time=$time&session=$session".
+            "&robot_instance=".C\ROBOT_INSTANCE."&machine_uri=".C\WEB_URI.
+            "&last_sync=".$this->last_sync;
+        if ($this->start_sync <= $this->last_sync &&
+            $this->last_sync + C\MIRROR_SYNC_FREQUENCY < $time) {
+            $request .= "&a=syncList";
+            L\crawlLog("Getting Sync List...");
+            $info_string = FetchUrl::getPage($request, null, true);
+            if ($info_string === false) {
+                return false;
+            }
+            $this->last_notify = $time;
+            $info_string = trim($info_string);
+            /* NOTE(review): unserialize() is applied to data received
+               over the network from the parent server; this is only safe
+               while that server is trusted */
+            $info = unserialize(gzuncompress(base64_decode($info_string)));
+            if (isset($info[self::STATUS]) &&
+                $info[self::STATUS] == self::CONTINUE_STATE) {
+                $this->start_sync = $time;
+                // a reply without a file list is treated as an empty list
+                $this->sync_schedule = (isset($info[self::DATA]) &&
+                    is_array($info[self::DATA])) ? $info[self::DATA] : [];
+                unset($info[self::DATA]);
+            } else if (isset($info[self::STATUS]) &&
+                $info[self::STATUS] == self::NO_DATA_STATE) {
+                $this->last_sync = $time;
+                $this->start_sync = $time;
+                $write_sync_time = false;
+            }
+        } else {
+            $info[self::STATUS] = ($this->last_sync == $this->start_sync) ?
+                self::NO_DATA_STATE : self::CONTINUE_STATE;
+            L\crawlLog("Current time $time, last notify time ".
+                $this->last_notify."...");
+            if ($time - $this->last_notify > C\MIRROR_NOTIFY_FREQUENCY) {
+                $request .= "&a=syncNotify";
+                FetchUrl::getPage($request, null, true);
+                $this->last_notify = $time;
+                L\crawlLog("Notifying master that mirror is alive..");
+            } else {
+                L\crawlLog("So not notifying scheduler..");
+            }
+        }
+        // nothing left to download: record the sync time on disk
+        if (count($this->sync_schedule) == 0 && $write_sync_time) {
+            $this->last_sync = $this->start_sync;
+            $this->db->setWorldPermissionsRecursive($this->sync_dir, true);
+            file_put_contents($this->last_sync_file,
+                serialize($this->last_sync));
+        }
+        L\crawlLog("  Time to check Scheduler ".
+            L\changeInMicrotime($start_time));
+        return $info;
+    }
+    /**
+     * Downloads the next file from the schedule of files to download received
+     * from the web app.
+     *
+     * Directories are created locally. Files smaller than DOWNLOAD_RANGE
+     * are fetched in one request; larger files are fetched segment by
+     * segment. A file whose download fails is put back on the schedule so
+     * that it is retried on a later call.
+     */
+    public function copyNextSyncFile()
+    {
+        if (count($this->sync_schedule) <= 0) {
+            return;
+        }
+        $dir = $this->sync_dir;
+        $server = $this->parent_url;
+        $time = time();
+        $session = md5($time . C\AUTH_KEY);
+        $file = array_pop($this->sync_schedule);
+        /* NOTE(review): $file['name'] is supplied by the parent server and
+           is used unchecked to build local paths; consider rejecting names
+           containing ".." */
+        $local_name = "$dir/{$file['name']}";
+        L\crawlLog("Start syncing {$file['name']}..");
+        if ($file['is_dir'] ) {
+            if (!file_exists($local_name)) {
+                mkdir($local_name);
+                L\crawlLog(".. {$file['name']} directory created.");
+            } else {
+                L\crawlLog(".. {$file['name']} directory exists.");
+            }
+            return;
+        }
+        $request =
+            "$server?c=resource&a=get&time=$time&session=$session".
+            "&f=cache&n=" . urlencode($file["name"]);
+        if ($file["size"] < self::DOWNLOAD_RANGE) {
+            $data = FetchUrl::getPage($request, null, true);
+            if ($file["size"] != strlen($data)) {
+                array_push($this->sync_schedule, $file);
+                L\crawlLog(".. {$file['name']} error ".
+                    "downloading, retrying.");
+                return;
+            }
+            file_put_contents($local_name, $data);
+            L\crawlLog(".. {$file['name']} file copied.");
+            return;
+        }
+        $fh = fopen($local_name, "wb");
+        if ($fh === false) {
+            L\crawlLog(".. {$file['name']} could not be opened for ".
+                "writing, skipping.");
+            return;
+        }
+        $request .= "&l=".self::DOWNLOAD_RANGE;
+        $offset = 0;
+        $failures = 0;
+        while ($offset < $file['size']) {
+            $data = FetchUrl::getPage($request."&o=$offset", null,
+                true);
+            $end_point = min($offset + self::DOWNLOAD_RANGE, $file["size"]);
+            //crude check if we need to redownload segment
+            if (strlen($data) != ($end_point - $offset)) {
+                $failures++;
+                if ($failures >= self::MAX_SEGMENT_RETRIES) {
+                    /* give control back to the main loop (which checks
+                       for stop requests) rather than retrying forever;
+                       the file is re-queued and restarted from scratch */
+                    fclose($fh);
+                    array_push($this->sync_schedule, $file);
+                    L\crawlLog(".. {$file['name']} error ".
+                        "downloading, retrying.");
+                    return;
+                }
+                L\crawlLog(".. Download error re-requesting segment");
+                continue;
+            }
+            $failures = 0;
+            fwrite($fh, $data);
+            L\crawlLog(".. {$file['name']} downloaded bytes $offset ".
+                "to $end_point..");
+            $offset = $end_point;
+        }
+        L\crawlLog(".. {$file['name']} file copied.");
+        fclose($fh);
+    }
+}
+/*
+ * Instantiate and run the Mirror program, mirroring the configured name
+ * server unless a mirror_parent.txt file says otherwise. start() finishes
+ * by entering the sync loop, so it only returns once that loop has exited.
+ */
+$syncer =  new Mirror(C\NAME_SERVER);
+$syncer->start();
+
--- a/src/executables/QueryTool.php
+++ b/src/executables/QueryTool.php
@ -0,0 +1,152 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\executables;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\FileCache;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\controllers\SearchController;
+
+/* This script is only meant to be run from the command line */
+if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+/** so can output plans */
+define("seekquarry\\yioop\\configs\\QUERY_STATISTICS", true);
+/** Loads common constants for web crawling*/
+require_once __DIR__."/../library/LocaleFunctions.php";
+if (!C\PROFILE) {
+    // trailing space keeps "visiting" and "its" apart once concatenated
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+/**
+ * Tool to provide a command line query interface to indexes stored in
+ * Yioop! database. Running with no arguments gives a help message for
+ * this tool.
+ *
+ * @author Chris Pollett
+ */
+class QueryTool implements CrawlConstants
+{
+    /**
+     * Initializes the QueryTool, for now does nothing
+     */
+    public function __construct()
+    {
+    }
+    /**
+     * Runs the QueryTool on the supplied command line arguments
+     *
+     * $argv[1] is the query. In the normal mode $argv[2] is the number of
+     * results per page (default 10), $argv[3] the index of the first
+     * result (default 0) and $argv[4] the locale tag (default
+     * DEFAULT_LOCALE). If $argv[2] is "plan" or "explain" the query plan
+     * is printed instead of results and the script exits.
+     */
+    public function start()
+    {
+        global $argv;
+        if (!isset($argv[1])) {
+            $this->usageMessageAndExit();
+        }
+        $query = $argv[1];
+        $plan_mode = isset($argv[2]) &&
+            ($argv[2] == "plan" || $argv[2] == "explain");
+        /* in plan mode $argv[2] is a keyword, not a count, so it must not
+           be forwarded as the number of results per page */
+        $results_per_page = (isset($argv[2]) && !$plan_mode) ?
+            $argv[2] : 10;
+        $limit = (isset($argv[3])) ? $argv[3] : 0;
+        L\setLocaleObject((isset($argv[4])) ? $argv[4] : C\DEFAULT_LOCALE);
+        $start_time = microtime(true);
+        $controller = new SearchController();
+        $data = $controller->queryRequest($query, $results_per_page, $limit);
+        if ($plan_mode) {
+            echo "\n" . $controller->model("phrase")->db->query_log[0]["PLAN"]
+                ."\n";
+            exit();
+        }
+        if (!isset($data['PAGES'])) {
+            $data['PAGES'] = [];
+        }
+        foreach ($data['PAGES'] as $page) {
+            echo "============\n";
+            echo "TITLE: ". trim($page[self::TITLE]). "\n";
+            echo "URL: ". trim($page[self::URL]). "\n";
+            echo "IPs: ";
+            if (isset($page[self::IP_ADDRESSES])) {
+                foreach ($page[self::IP_ADDRESSES] as $address) {
+                    echo $address." ";
+                }
+            }
+            echo "\n";
+            echo "DESCRIPTION: ".wordwrap(trim($page[self::DESCRIPTION]))."\n";
+            echo "Rank: ".$page[self::DOC_RANK]."\n";
+            echo "Relevance: ".$page[self::RELEVANCE]."\n";
+            echo "Proximity: ".$page[self::PROXIMITY]."\n";
+            echo "Score: ".$page[self::SCORE]."\n";
+            echo "============\n\n";
+        }
+        $data['ELAPSED_TIME'] = L\changeInMicrotime($start_time);
+        echo "QUERY STATISTICS\n";
+
+        echo "============\n";
+        echo "ELAPSED TIME: ".$data['ELAPSED_TIME']."\n";
+        if (isset($data['LIMIT'])) {
+            echo "LOW: ".$data['LIMIT']."\n";
+        }
+        /* HIGH is derived from these three values, so they (rather than a
+           'HIGH' key that is never read) are what must be present */
+        if (isset($data['TOTAL_ROWS'], $data['LIMIT'],
+            $data['RESULTS_PER_PAGE'])) {
+            echo "HIGH: ".min($data['TOTAL_ROWS'],
+                $data['LIMIT'] + $data['RESULTS_PER_PAGE'])."\n";
+        }
+        if (isset($data['TOTAL_ROWS'])) {
+            echo "TOTAL ROWS: ".$data['TOTAL_ROWS']."\n";
+        }
+        if (isset($data['ERROR'])) {
+            echo $data['ERROR']."\n";
+        }
+    }
+    /**
+     * Outputs the "how to use this tool message" and then exit()'s.
+     */
+    public function usageMessageAndExit()
+    {
+        echo "\nQueryTool.php is used to run a Yioop";
+        echo " query from the command line.\n For example,\n";
+        echo "  php QueryTool.php 'chris pollett' \n returns results ".
+            "from the default index of a search on 'chris pollett'.\n";
+        echo "The general command format is:\n";
+        echo "  php QueryTool.php query num_results start_num lang_tag\n\n";
+        echo "QueryTool.php can also be used to explain the plan by which\n";
+        echo "Yioop will compute query results. For this usage one types:\n";
+        echo "  php QueryTool.php query plan\n";
+        echo "or\n";
+        echo "  php QueryTool.php query explain\n";
+        echo "For example,";
+        echo "  php QueryTool.php 'chris pollett' explain\n";
+        exit();
+    }
+}
+/*
+ * Instantiate and run the QueryTool program
+ */
+$query_tool =  new QueryTool();
+$query_tool->start();
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
--- a/src/favicon.ico
+++ b/src/favicon.ico
--- a/src/index.php
+++ b/src/index.php
@ -0,0 +1,615 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * Main web interface entry point for Yioop!
+ * search site. Used to both get and display
+ * search results. Also used for inter-machine
+ * communication during crawling
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+
+/**
+ * Main entry point to the Yioop web app.
+ *
+ * Initialization is done in a function to avoid polluting the global
+ * namespace with variables. In order, this function: loads the utility
+ * library, runs url-rewrite routing, optionally installs the Yioop error
+ * handler, sends security headers, starts the session, chooses the
+ * controller named by the request variable c (falling back to a default
+ * when c is missing or not allowed), runs the upgrade checks, and finally
+ * calls processRequest() on an instance of the chosen controller.
+ */
+function bootstrap()
+{
+    /**
+     * For error function and yioop constants
+     */
+    require_once __DIR__ . "/library/Utility.php";
+    /**
+     * Did we come to this index.php from ../index.php? If so, rewriting
+     * must be on. If nothing has defined REDIRECTS_ON yet, default it to
+     * false.
+     */
+    if (!C\nsdefined("REDIRECTS_ON")) {
+        C\nsdefine("REDIRECTS_ON", false);
+    }
+    /**
+     * Check if doing url rewriting, and if so, do initial routing
+     */
+    configureRewrites();
+    // only install the custom error handler when the ERROR_INFO bit of
+    // DEBUG_LEVEL is set
+    if ((C\DEBUG_LEVEL & C\ERROR_INFO) == C\ERROR_INFO) {
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
+    }
+    /**
+     * Load global functions related to localization
+     */
+    require_once __DIR__."/library/LocaleFunctions.php";
+    ini_set("memory_limit","500M");
+    // every controller except resource gets the anti-framing header
+    if (empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource') {
+        header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking
+    }
+    header("X-Content-Type-Options: nosniff"); /*
+        Let browsers know that we should be setting the mimetype correctly --
+        For non-dumb browsers this should help protect against XSS attacks
+        using images containing HTML. Also, might help against PRSSI attacks.
+        */
+    // start a session only if one is not already active
+    if (session_status() == PHP_SESSION_NONE) {
+        session_name(C\SESSION_NAME);
+        session_start();
+    }
+    /**
+     * Load global functions related to checking Yioop! version
+     */
+    require_once C\BASE_DIR."/library/UpgradeFunctions.php";
+    if (!function_exists('mb_internal_encoding')) {
+        echo "PHP Zend Multibyte Support must be enabled for Yioop! to run.";
+        exit();
+    }
+    /**
+     * Make an initial setting of controllers. This can be overridden in
+     * local_config
+     */
+    $available_controllers = ["admin", "api", "archive",  "cache",
+        "classifier", "crawl", "fetch", "group", "jobs", "machine", "resource",
+        "search", "settings", "static"];
+    // a localControllers() function in the configs namespace can add more
+    if (function_exists(C\NS_CONFIGS . "localControllers")) {
+        $available_controllers = array_merge($available_controllers,
+            C\localControllers());
+    }
+    // the register controller is only offered for these registration types
+    if (in_array(C\REGISTRATION_TYPE, ['no_activation', 'email_registration',
+        'admin_activation'])) {
+        $available_controllers[] = "register";
+    }
+    // with web access off the list built above is discarded in favor of
+    // this fixed, smaller set
+    if (!C\WEB_ACCESS) {
+        $available_controllers = ["admin", "archive", "cache", "crawl","fetch",
+            "jobs", "machine"];
+    }
+    //the request variable c is used to determine the controller
+    if (!isset($_REQUEST['c'])) {
+        $controller_name = "search";
+        // no controller and no query: show the static Main page instead of
+        // the search page when a landing page is configured
+        if (C\nsdefined('LANDING_PAGE') && C\LANDING_PAGE &&
+            !isset($_REQUEST['q'])) {
+            $controller_name = "static";
+            $_REQUEST['c'] = "static";
+            $_REQUEST['p'] = "Main";
+        }
+    } else {
+        $controller_name = $_REQUEST['c'];
+    }
+    // requests for a controller that is not allowed fall back to search,
+    // or to admin when web access is off
+    if (!in_array($controller_name, $available_controllers))
+    {
+        if (C\WEB_ACCESS) {
+            $controller_name = "search";
+        } else {
+            $controller_name = "admin";
+        }
+    }
+    // if no profile exists we force the page to be the configuration page
+    if (!C\PROFILE || (C\nsdefined("FIX_NAME_SERVER") && C\FIX_NAME_SERVER)) {
+        $controller_name = "admin";
+    }
+    $locale_tag = L\getLocaleTag();
+    if (L\upgradeDatabaseWorkDirectoryCheck()) {
+        /**
+         * Load global functions needed to upgrade between versions
+         * (note only do this if need to upgrade)
+         */
+        require_once C\BASE_DIR."/library/VersionFunctions.php";
+        L\upgradeDatabaseWorkDirectory();
+    }
+    if (L\upgradeLocalesCheck($locale_tag)) {
+        L\upgradeLocales();
+    }
+    //upgrade manipulations might mess with global locale, so set it back here
+    L\setLocaleObject($locale_tag);
+    /**
+     * Loads controller responsible for calculating
+     * the data needed to render the scene
+     *
+     */
+    // e.g., "search" becomes the class name NS_CONTROLLERS . "SearchController"
+    $controller_class = C\NS_CONTROLLERS . ucfirst($controller_name) .
+        "Controller";
+    $controller = new $controller_class();
+    $controller->processRequest();
+}
+/**
+ * Used to set up and handle url rewriting for the Yioop Web app
+ *
+ * Developers can add new routes by creating a Routes class in
+ * the app_dir with a static method getRoutes which should return
+ * an associative array of incoming_path => handler function.
+ *
+ * When REDIRECTS_ON is false this function only rejects
+ * index.php/something style paths and makes sure PATH_INFO is set; it does
+ * no routing. When a route is found, its handler is called with the request
+ * path split on "/"; paths that no handler accepts get the 404 error page.
+ */
+function configureRewrites()
+{
+    // first path component of the request => name of the handler function
+    // (in this namespace) that routes it
+    $route_map = [
+        'advertise' => 'routeDirect',
+        'blog' => 'routeDirect',
+        'bot' => 'routeDirect',
+        'privacy' => 'routeDirect',
+        'terms' => 'routeDirect',
+        'admin' => 'routeController',
+        'register' => 'routeController',
+        'settings' => 'routeController',
+        's' => "routeSubsearch",
+        'more' => 'routeMore',
+        'suggest' => 'routeSuggest',
+        'group' => 'routeFeeds',
+        'thread' => 'routeFeeds',
+        'user' => 'routeFeeds',
+        'p' => 'routeWiki'
+    ];
+    // site-specific routes are merged last, so they override the defaults
+    if (class_exists(C\NS. "Routes")) {
+        $route_map = array_merge($route_map, Routes::getRoutes());
+    }
+    /**
+     * Check for paths of the form index.php/something which yioop doesn't
+     * support
+     */
+    $s_name = $_SERVER['SCRIPT_NAME']."/";
+    $path_name = substr($_SERVER["REQUEST_URI"], 0, strlen($s_name));
+    if (strcmp($path_name, $s_name) == 0) {
+        // set PATH_TRANSLATED and PATH_INFO before loading error.php
+        $_SERVER["PATH_TRANSLATED"] = C\BASE_DIR;
+        $scriptinfo = pathinfo($s_name);
+        $_SERVER["PATH_INFO"] = ($scriptinfo["dirname"] == "/") ? "" :
+            $scriptinfo["dirname"] ;
+        require_once(C\BASE_DIR."/error.php");
+        if (C\REDIRECTS_ON) {
+            return;
+        }
+        exit();
+    }
+    if (!isset($_SERVER["PATH_INFO"])) {
+        $_SERVER["PATH_INFO"] = ".";
+    }
+    if (!C\REDIRECTS_ON) {
+        return;
+    }
+    /**
+     * Now look for and handle routes
+     */
+    $index_php = "index.php";
+    // PHP_SELF with the trailing "index.php" removed
+    $script_path = substr($_SERVER['PHP_SELF'], 0, -strlen($index_php));
+    // $request_script is the request uri with $script_path cut from the
+    // front and the query string (and its "?") cut from the back
+    if ($_SERVER['QUERY_STRING'] == "") {
+        $request_script = rtrim(
+            substr($_SERVER['REQUEST_URI'], strlen($script_path)), "?");
+    } else {
+        $request_script = substr($_SERVER['REQUEST_URI'], strlen($script_path),
+            -strlen($_SERVER['QUERY_STRING']) -  1);
+    }
+    $request_script = ($request_script == "") ? $index_php : $request_script;
+    // a request for the app root or index.php itself needs no routing
+    if (in_array($request_script, ['', $index_php])) {
+        return;
+    }
+    $request_parts = explode("/", $request_script);
+    $handled = false;
+    if (isset($route_map[$request_parts[0]])) {
+        if (empty($_REQUEST['c']) || $_REQUEST['c'] == $request_parts[0]) {
+            // call the handler by its namespace-qualified name
+            $route = C\NS . $route_map[$request_parts[0]];
+            $handled = $route($request_parts);
+        } else if (!empty($_REQUEST['c'])) {
+            // an explicit c that differs from the route name is left alone:
+            // no handler is run and no 404 is produced
+            $handled = true;
+        }
+    }
+    if (!$handled) {
+        $_REQUEST['p'] = "404";
+        require_once __DIR__."/error.php";
+    }
+}
+/**
+ * Used to route page requests to fixed Public Group wiki pages that should
+ * always be present. The request is served by loading the PHP script in
+ * this directory whose name matches the first url part (the default route
+ * map sends advertise, blog, bot, privacy, and terms here).
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not (always true)
+ */
+function routeDirect($route_args)
+{
+    $page_script = __DIR__ . "/{$route_args[0]}.php";
+    // record that the controller was chosen by the route
+    $_REQUEST['route']['c'] = true;
+    require_once $page_script;
+    return true;
+}
+/**
+ * Given the name of a fixed public group static page creates the url
+ * where it can be accessed in this instance of Yioop, making use of the
+ * defined variable REDIRECTS_ON.
+ *
+ * @param string $name of static page
+ * @param bool $with_delim whether the url should end with a "?" so that
+ *      query parameters can be appended directly, or with nothing
+ * @return string url for the page in question
+ */
+function directUrl($name, $with_delim = false)
+{
+    // Neither form of the url contains a query string yet, so the only
+    // delimiter that can start one is "?". (An "&" after "name.php" would
+    // be read as part of the path, not as a parameter separator.)
+    $delim = ($with_delim) ? "?" : "";
+    if (C\REDIRECTS_ON) {
+        return C\BASE_URL . $name . $delim;
+    }
+    return C\BASE_URL . "$name.php$delim";
+}
+/**
+ * Used to route page requests for pages corresponding to a group, user,
+ * or thread feed. If redirects on then urls ending with /feed_type/id map
+ * to a page for the id'th item of that feed_type. A third url part turns
+ * the request into a wiki request for the group with that id: "pages"
+ * asks for the page list, anything else is taken as a page name.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not
+ */
+function routeFeeds($route_args)
+{
+    $has_id = isset($route_args[1]);
+    // an id part that does not compare equal to its integer value cannot
+    // be routed
+    if ($has_id && $route_args[1] != intval($route_args[1])) {
+        return false;
+    }
+    $_REQUEST['c'] = "group";
+    if ($has_id && !empty($route_args[2])) {
+        $_REQUEST['a'] = 'wiki';
+        if ($route_args[2] == 'pages') {
+            $_REQUEST['arg'] = 'pages';
+            $_REQUEST['route']['arg'] = true;
+        } else {
+            $_REQUEST['page_name'] = $route_args[2];
+            $_REQUEST['route']['page_name'] = true;
+        }
+    }
+    // a wiki activity (from the route or the query string) is kept,
+    // everything else becomes a feed request
+    $is_wiki = isset($_REQUEST['a']) && $_REQUEST['a'] == 'wiki';
+    $_REQUEST['a'] = ($is_wiki) ? $_REQUEST['a'] : "groupFeeds";
+    $_REQUEST['route']['c'] = true;
+    $_REQUEST['route']['a'] = true;
+    if (!$has_id) {
+        return true;
+    }
+    if ($_REQUEST['a'] == 'wiki') {
+        $_REQUEST['group_id'] = $route_args[1];
+        $_REQUEST['route']['group_id'] = true;
+        return true;
+    }
+    // feed filters are named just_group_id, just_user_id, or just_thread
+    $suffix = ($route_args[0] == 'thread') ? "" : "_id";
+    $filter_field = "just_" . $route_args[0] . $suffix;
+    $_REQUEST[$filter_field] = $route_args[1];
+    $_REQUEST['route'][$filter_field] = true;
+    return true;
+}
+/**
+ * Given the type of feed, the identifier of the feed instance, and which
+ * controller is being used creates the url where that feed item can be
+ * accessed from the instance of Yioop. It makes use of the
+ * defined variable REDIRECTS_ON.
+ *
+ * @param string $type of feed: group, user, thread (empty for the feed of
+ *      all groups)
+ * @param int $id the identifier for that feed.
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? or &
+ * @param string $controller which controller is being used to access the
+ *      feed: usually admin or group
+ * @return string url for the page in question
+ */
+function feedsUrl($type, $id, $with_delim = false, $controller = "group")
+{
+    $has_type = ($type != "");
+    if (C\REDIRECTS_ON && $controller == 'group') {
+        // pretty url: feed_type/id, or just group when no type was given
+        $path = ($has_type) ? "$type/$id" : "group";
+        $tail = ($with_delim) ? "?" : "";
+        return C\BASE_URL . $path . $tail;
+    }
+    $via_admin_route = (C\REDIRECTS_ON && $controller == "admin");
+    if ($has_type && $via_admin_route && $type == "group") {
+        // this form always ends in "?", whatever $with_delim says
+        return C\BASE_URL . "admin/$id" . "?";
+    }
+    $prefix = ($via_admin_route) ? "admin?" : "?c=$controller&";
+    $url = C\BASE_URL . $prefix . "a=groupFeeds";
+    if ($has_type) {
+        // filters are named just_group_id, just_user_id, or just_thread
+        $suffix = ($type == 'thread') ? "" : "_id";
+        $url .= "&just_{$type}$suffix=$id";
+    }
+    return ($with_delim) ? $url . "&" : $url;
+}
+/**
+ * Used to route requests for the more and tools link on the landing page.
+ * If redirects on, then /more routes to this more tools page, which is the
+ * more activity of the search controller.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not (always true)
+ */
+function routeMore($route_args)
+{
+    $_REQUEST['c'] = "search";
+    $_REQUEST['a'] = "more";
+    // both fields above came from the route rather than the query string
+    foreach (['c', 'a'] as $routed_field) {
+        $_REQUEST['route'][$routed_field] = true;
+    }
+    return true;
+}
+/**
+ * Return the url for the more and tools link on the landing page making use of
+ * the defined variable REDIRECTS_ON.
+ *
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? (redirects on) or & (redirects off)
+ * @return string url for the page in question
+ */
+function moreUrl($with_delim = false)
+{
+    if (C\REDIRECTS_ON) {
+        $url = C\BASE_URL . "more";
+        $terminator = "?";
+    } else {
+        $url = C\BASE_URL . "?a=more";
+        $terminator = "&";
+    }
+    if ($with_delim) {
+        $url .= $terminator;
+    }
+    return $url;
+}
+/**
+ * Used to route page requests to end-user controllers such as
+ * settings, register, admin. urls ending with /controller_name will
+ * be routed to that controller. An integer-like second url part is taken
+ * as a group id: with a third part (or a=wiki already in the request) it
+ * selects that group's wiki, otherwise it selects that group's feed.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not (always true)
+ */
+function routeController($route_args)
+{
+    $_REQUEST['c'] = $route_args[0];
+    $_REQUEST['route']['c'] = true;
+    if (!isset($route_args[1]) || intval($route_args[1]) != $route_args[1]) {
+        return true;
+    }
+    $group_id = $route_args[1];
+    $wiki_requested = isset($_REQUEST['a']) && $_REQUEST['a'] == 'wiki';
+    if ($wiki_requested) {
+        $_REQUEST['group_id'] = $group_id;
+    } else if (!empty($route_args[2])) {
+        $_REQUEST['a'] = 'wiki';
+        $_REQUEST['group_id'] = $group_id;
+        if ($route_args[2] == 'pages') {
+            $_REQUEST['arg'] = 'pages';
+            $_REQUEST['route']['arg'] = true;
+        } else {
+            $_REQUEST['page_name'] = $route_args[2];
+        }
+        // page_name is flagged as routed for both kinds of third part
+        $_REQUEST['route']['page_name'] = true;
+        $_REQUEST['route']['a'] = true;
+    } else {
+        $_REQUEST['a'] = 'groupFeeds';
+        $_REQUEST['just_group_id'] = $group_id;
+    }
+    $_REQUEST['route']['group_id'] = true;
+    return true;
+}
+/**
+ * Given the name of a controller for which an easy end-user link is useful
+ * creates the url where it can be accessed on this instance of Yioop,
+ * making use of the defined variable REDIRECTS_ON. Examples of end-user
+ * controllers would be the settings, admin, and register controllers.
+ * When redirects are on this also marks the c field of the current request
+ * as routed.
+ *
+ * @param string $name of controller
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? or &
+ * @return string url for the page in question
+ */
+function controllerUrl($name, $with_delim = false)
+{
+    if (!C\REDIRECTS_ON) {
+        $url = C\BASE_URL . "?c=$name";
+        return ($with_delim) ? $url . "&" : $url;
+    }
+    $_REQUEST['route']['c'] = true;
+    $url = C\BASE_URL . $name;
+    return ($with_delim) ? $url . "?" : $url;
+}
+/**
+ * Used to route page requests for subsearches such as news, video, and images
+ * (site owner can define other). Urls of the form /s/subsearch will
+ * go to the page handling the subsearch.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not; false when the
+ *      url has no subsearch name after /s
+ */
+function routeSubsearch($route_args)
+{
+    if (!isset($route_args[1])) {
+        return false;
+    }
+    $_REQUEST['route']['c'] = true;
+    $_REQUEST['route']['s'] = true;
+    $_REQUEST['c'] = "search";
+    $_REQUEST['s'] = $route_args[1];
+    return true;
+}
+/**
+ * Given the name of a subsearch creates the url where it can be accessed
+ * on this instance of Yioop, making use of the defined variable REDIRECTS_ON.
+ * Examples of subsearches include news, video, and images. A site owner
+ * can add to these and delete from these.
+ *
+ * @param string $name of subsearch
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? or &
+ * @return string url for the page in question
+ */
+function subsearchUrl($name, $with_delim = false)
+{
+    if (!C\REDIRECTS_ON) {
+        $url = C\BASE_URL . "?s=$name";
+        return ($with_delim) ? $url . "&" : $url;
+    }
+    $url = C\BASE_URL . "s/$name";
+    return ($with_delim) ? $url . "?" : $url;
+}
+/**
+ * Used to route requests for the suggest-a-url link on the tools page.
+ * If redirects on, then /suggest routes to this suggest-a-url page, which
+ * is the suggestUrl activity of the register controller.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not (always true)
+ */
+function routeSuggest($route_args)
+{
+    $target = ['c' => "register", 'a' => "suggestUrl"];
+    foreach ($target as $field => $value) {
+        $_REQUEST[$field] = $value;
+    }
+    return true;
+}
+/**
+ * Return the url for the suggest-a-url link on the more tools page, making use
+ * of the defined variable REDIRECTS_ON. When redirects are on this also
+ * marks the c and a fields of the current request as routed.
+ *
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? or &
+ * @return string url for the page in question
+ */
+function suggestUrl($with_delim = false)
+{
+    if (!C\REDIRECTS_ON) {
+        $url = C\BASE_URL . "?c=register&a=suggestUrl";
+        return ($with_delim) ? $url . "&" : $url;
+    }
+    $_REQUEST['route']['c'] = true;
+    $_REQUEST['route']['a'] = true;
+    $url = C\BASE_URL . "suggest";
+    return ($with_delim) ? $url . "?" : $url;
+}
+/**
+ * Used to route page requests for pages corresponding to a wiki page of
+ * group. If it is a wiki page for the public group viewed without being
+ * logged in, the route might come in as yioop_instance/p/page_name if
+ * redirects are on. If it is for a non-public wiki or page accessed while
+ * logged in the url will look like either:
+ * yioop_instance/group/group_id?a=wiki&page_name=some_name
+ * or
+ * yioop_instance/admin/group_id?a=wiki&page_name=some_name&csrf_token_string
+ *
+ * The special name p/pages is sent to the group controller's wiki page
+ * list; every other p/page_name is sent to the static controller.
+ *
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not; false when no
+ *      page name follows /p
+ */
+function routeWiki($route_args)
+{
+    if (!isset($route_args[1])) {
+        return false;
+    }
+    $wants_page_list = ($route_args[1] == 'pages');
+    if (!$wants_page_list) {
+        $_REQUEST['c'] = "static";
+        $_REQUEST['p'] = $route_args[1];
+        $_REQUEST['route']['c'] = true;
+        $_REQUEST['route']['p'] = true;
+        return true;
+    }
+    $_REQUEST['c'] = "group";
+    $_REQUEST['a'] = 'wiki';
+    $_REQUEST['arg'] = 'pages';
+    foreach (['c', 'a', 'arg'] as $routed_field) {
+        $_REQUEST['route'][$routed_field] = true;
+    }
+    return true;
+}
+/**
+ * Given the name of a wiki page, the group it belongs to, and which
+ * controller is being used creates the url where that wiki page can be
+ * accessed from the instance of Yioop. It makes use of the
+ * defined variable REDIRECTS_ON.
+ *
+ * @param string $name of wiki page. With redirects off, the special name
+ *      'pages' produces the url of the group's list of wiki pages. An
+ *      empty name selects the main page (static controller) or the wiki
+ *      itself (other controllers).
+ * @param bool $with_delim whether it should be terminated with nothing or
+ *      ? or &
+ * @param string $controller which controller is being used to access the
+ *      wiki: usually static (for the public group), admin, or group
+ * @param int $id the group the wiki page belongs to
+ * @return string url for the page in question
+ */
+function wikiUrl($name, $with_delim = false, $controller = "static", $id =
+    C\PUBLIC_GROUP_ID)
+{
+    $q = ($with_delim) ? "?" : "";
+    $a = ($with_delim) ? "&" : "";
+    $is_static = ($controller == "static");
+    if (C\REDIRECTS_ON) {
+        if ($is_static) {
+            if ($name == "") {
+                $name = "Main";
+            }
+            return C\BASE_URL ."p/$name$q";
+        }
+        // the no-name form already carries a query string, so it is
+        // continued with & rather than ?
+        $page = ($name == "") ? "?a=wiki$a" : "/$name$q";
+        return C\BASE_URL . $controller . "/$id$page";
+    }
+    if ($name == 'pages') {
+        if ($is_static) {
+            // The page list is a wiki activity, so link to the group
+            // controller, the same controller routeWiki() uses for
+            // p/pages. (This used to read an undefined variable, which
+            // produced the url "?c=&a=wiki...".)
+            $controller = "group";
+        }
+        return C\BASE_URL .
+            "?c=$controller&a=wiki&arg=pages&group_id=$id$a";
+    }
+    if ($is_static) {
+        if ($name == "") {
+            $name = "main";
+        }
+        return C\BASE_URL . "?c=static&p=$name$a";
+    }
+    $page = ($name == "") ? "" : "&page_name=$name";
+    return C\BASE_URL . "?c=$controller&a=wiki&group_id=$id$page$a";
+}
+// Start the web app, unless SKIP_BOOTSTRAP has been defined in the configs
+// namespace, in which case only the functions above are made available.
+if (!defined('seekquarry\\yioop\\configs\\SKIP_BOOTSTRAP')) {
+    bootstrap();
+}
--- a/src/library/AnalyticsManager.php
+++ b/src/library/AnalyticsManager.php
@ -0,0 +1,67 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\library\CrawlConstants;
+
+/**
+ * Static key-value store used to pass SQL query and search query timing
+ * statistics between models and index_bundle_iterators
+ *
+ * @author Chris Pollett
+ */
+class AnalyticsManager
+{
+    /**
+     * Holds every statistic recorded with set(), keyed by attribute name
+     * @var array
+     */
+    private static $data = [];
+    /**
+     * Looks up the timing statistic recorded for $attribute
+     * @param string $attribute name of the statistic wanted
+     * @return mixed whatever was stored for that statistic; null if nothing
+     *      (or null) was stored
+     */
+    public static function get($attribute)
+    {
+        if (!isset(self::$data[$attribute])) {
+            return null;
+        }
+        return self::$data[$attribute];
+    }
+    /**
+     * Records the timing statistic $value under the name $attribute,
+     * replacing any value stored under that name before
+     * @param string $attribute name of the statistic to set
+     * @param mixed $value whatever timing information is to be associated
+     *     with $attribute
+     */
+    public static function set($attribute, $value)
+    {
+        self::$data[$attribute] = $value;
+    }
+}
--- a/src/library/BTree.php
+++ b/src/library/BTree.php
@ -0,0 +1,779 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * This class implements the B-Tree data structure for storing int key based
+ * key-value pairs based on the algorithms in Introduction To Algorithms,
+ * by T.H. Cormen, C.E. Leiserson, R.L. Rivest, and C. Stein. Second
+ * Edition, 2001, The MIT Press
+ *
+ * @author Akshat Kukreti
+ */
+class BTree
+{
+    /**
+     * Default value of minimum degree. The minimum degree determines the
+     * minimum and maximum number of keys and child nodes, for nodes
+     * other than root node
+     */
+    const MIN_DEGREE = 501;
+    /**
+     * Minimum degree of the B-Tree. Used in determining the minimum/maximum
+     * keys and links a B-Tree node may have.
+     * minimum_keys = minimum_degree - 1
+     * minimum_links = minimum_keys + 1
+     * maximum_keys = 2 * minimum_degree - 1
+     * maximum_links = maximum_keys + 1
+     * @var int
+     */
+    public $min_degree;
+    /**
+     * Storage for root node of the B-Tree
+     * @var object
+     */
+    public $root;
+    /**
+     * Counter for node Ids
+     * @var int
+     */
+    public $id_count;
+    /**
+     * Directory for storing the B-Tree files
+     * @var string
+     */
+    public $dir;
+    /**
+     * Creates/Loads B-Tree having specified directory and minimum_degree. The
+     * default minimum_degree is 501. The directory is created if it does
+     * not exist. If it already holds a root.txt file, the root node and the
+     * node id counter (count.txt) are loaded from it; otherwise the tree
+     * starts with a new root node and an id counter of 1.
+     * @param string $dir is the directory for storing the B-Tree files
+     * @param int $min_degree minimum degree of a B-tree node
+     */
+    public function __construct($dir, $min_degree = self::MIN_DEGREE)
+    {
+        $this->dir = $dir;
+        $this->min_degree = $min_degree;
+        if (!is_dir($this->dir)) {
+            mkdir($this->dir);
+            @chmod($this->dir, 0777);
+        }
+        $root_path = "{$this->dir}/root.txt";
+        if (!file_exists($root_path)) {
+            $fresh_root = new BTNode();
+            $fresh_root->id = "root";
+            $this->root = $fresh_root;
+            $this->id_count = 1;
+            return;
+        }
+        $this->root = unserialize(file_get_contents($root_path));
+        $count_path = "{$this->dir}/count.txt";
+        $this->id_count = unserialize(file_get_contents($count_path));
+    }
+    /**
+     * Reads node from its file saved in the B-Tree directory
+     * @param int $id is the Id of the node to be read
+     * @return mixed the unserialized node, or false (after logging a
+     *      message) if the node has no file on disk
+     */
+    public function readNode($id)
+    {
+        $node_path = $this->dir."/$id.txt";
+        if (!file_exists($node_path)) {
+            crawlLog("Btree could not read node $id from disk");
+            return false;
+        }
+        return unserialize(file_get_contents($node_path));
+    }
+    /**
+     * Writes node to disk as a serialized file named after the node's id
+     * and then tries to make that file accessible to everyone
+     * @param object $node is the node to be written to disk
+     */
+    public function writeNode($node)
+    {
+        $node_path = $this->dir."/{$node->id}.txt";
+        file_put_contents($node_path, serialize($node));
+        @chmod($node_path, 0777);
+    }
+    /**
+     * Saves the current root node of this btree to disk using writeNode
+     */
+    public function writeRoot()
+    {
+        $root_node = $this->root;
+        $this->writeNode($root_node);
+    }
+    /**
+     * Deletes file associated with given node from disk. If there is no
+     * such file a message is logged instead.
+     * @param int $id is the id of the node whose file is to be deleted
+     */
+    public function deleteNodeFile($id)
+    {
+        $node_path = $this->dir."/$id.txt";
+        if (!file_exists($node_path)) {
+            crawlLog("Could not delete node $id from disk");
+            return;
+        }
+        unlink($node_path);
+    }
+    /**
+     * Saves the current value of the node id counter ($this->id_count),
+     * serialized, to the count.txt file of the B-Tree directory
+     */
+    public function saveNodeCount()
+    {
+        $serialized_count = serialize($this->id_count);
+        file_put_contents($this->dir."/count.txt", $serialized_count);
+    }
+    /**
+     * Deletes the file (count.txt) in which the node id count is saved
+     */
+    public function deleteCount()
+    {
+        $count_file = $this->dir."/count.txt";
+        unlink($count_file);
+    }
+    /**
+     * Returns key-value pair in the B-Tree based on key
+     * @param int $key is the key for which the key-value pair is to be
+     * found
+     * @return array key-value pair associated with $key or null if the
+     * key-value pair is not found in the tree.
+     */
+    public function findValue($key)
+    {
+        list($node, $flag, $pos) = $this->search($this->root, $key);
+        // search reports a hit with a flag of 1 and a non-null position
+        $was_found = ($pos !== null && $flag == 1);
+        return ($was_found) ? $node->keys[$pos] : null;
+    }
+    /**
+     * Searches for key-value pair for a given key in a node. If key value pair
+     * is not found in the node, recursively searches in the root node of the
+     * sub-tree till the pair is found. Search stops at leaf nodes.
+     * @param object $node is the B-Tree node from where the search starts
+     * @param int $key is the key for which the key-value pair is to be
+     * searched
+     * @return array triple [node where the search ended, flag, position].
+     * The flag is 1 when the key was found at that position of the node's
+     * keys. For a node without keys the flag is -1 and the position is null.
+     */
+    public function search($node, $key)
+    {
+        if (empty($node->keys)) {
+            return [$node, -1, null];
+        }
+        list($flag, $pos) = $this->binarySearch($node->keys, $key);
+        // stop on a hit, or when there are no children left to descend into
+        if ($flag == 1 || $node->is_leaf == true) {
+            return [$node, $flag, $pos];
+        }
+        $child = $this->readNode($node->links[$pos]);
+        return $this->search($child, $key);
+    }
+    /**
+     * Inserts a key-value pair in the B-Tree, starting from the root node.
+     * A full root (2 * min_degree - 1 keys) is first split under a new,
+     * empty root so that the insertion always descends into a node that is
+     * not full.
+     * @param array $pair is the key-value pair to be inserted
+     *     (insertNodeNotFull reads the key from $pair[0])
+     */
+    public function insert($pair)
+    {
+        $node = $this->root;
+        if (empty($node->keys)) {
+            // first pair ever stored in the root: save the root and the
+            // node id counter to disk
+            $node->keys = [$pair];
+            $node->count = count($node->keys);
+            $this->writeNode($node);
+            $this->saveNodeCount();
+        } else if ($node->count == 2 * $this->min_degree - 1) {
+            // root is full: make a new empty node the root, with the old
+            // root as its first child, and split that child
+            $temp = $this->createEmptyParentNode();
+            $this->root = $temp;
+            // NOTE(review): swap() is defined outside this view; presumably
+            // it exchanges the two node ids so that the new root keeps the
+            // id "root" -- confirm
+            $this->swap($temp->id, $node->id);
+            $temp->links[0] = $node->id;
+            $this->bTreeSplitChild($temp, 0, $node);
+            $this->insertNodeNotFull($temp, $pair);
+        } else {
+            $this->insertNodeNotFull($node, $pair);
+        }
+    }
+    /**
+     * Inserts a key-value pair in a leaf node that is not full. Searches for
+     * the appropriate leaf node, splitting full nodes before descending
+     * down the tree recursively.
+     * @param object $node is the node from where the search for the leaf node
+     * begins
+     * @param array $pair is the key-value pair
+     */
+    public function insertNodeNotFull($node, $pair)
+    {
+        $key = $pair[0];
+        $i = $node->count - 1;
+        list($flag, $pos) = $this->binarySearch($node->keys, $key);
+        if ($flag == 1) {
+            // The key is already stored in this node (leaf or internal):
+            // overwrite its pair in place instead of inserting a duplicate.
+            $node->keys[$pos] = $pair;
+            $this->writeNode($node);
+            return;
+        }
+        if ($node->is_leaf == true) {
+            // Shift every pair with a larger key one slot to the right and
+            // drop the new pair into the gap.
+            while ($i >= 0 && $node->keys[$i][0] > $key) {
+                $node->keys[$i + 1] = $node->keys[$i];
+                $i -= 1;
+            }
+            $node->keys[$i + 1] = $pair;
+            $node->count = count($node->keys);
+            $this->writeNode($node);
+            return;
+        }
+        // Internal node: find the link whose sub-tree should hold $key.
+        while ($i >= 0 && $node->keys[$i][0] > $key) {
+            $i -= 1;
+        }
+        $i += 1;
+        $next_node = $this->readNode($node->links[$i]);
+        if ($next_node->count == 2 * $this->min_degree - 1) {
+            // Split a full child before descending so the recursion never
+            // enters a full node. The child's median pair moves up to
+            // $node->keys[$i].
+            $this->bTreeSplitChild($node, $i, $next_node);
+            if ($key == $node->keys[$i][0]) {
+                // The promoted median is the key being inserted. Update it
+                // here; descending would add a second copy of the key to the
+                // left child and leave this (older) pair shadowing it.
+                $node->keys[$i] = $pair;
+                $this->writeNode($node);
+                return;
+            }
+            if ($key > $node->keys[$i][0]) {
+                // $key belongs to the newly created right half.
+                $i += 1;
+                $next_node = $this->readNode($node->links[$i]);
+            }
+        }
+        $this->insertNodeNotFull($next_node, $pair);
+    }
+    /**
+     * Splits a full node into two child nodes. The median key-value pair is
+     * added to the parent node of the node being split.
+     *
+     * @param object $parent is the parent node
+     * @param int $i is the link to child node
+     * @param object $child is the child node
+     */
+    public function bTreeSplitChild($parent, $i, $child)
+    {
+        // Allocate the next node id for the new right-hand sibling ($temp).
+        $this->id_count += 1;
+        $temp = new BTNode();
+        $temp->id = $this->id_count;
+        $this->saveNodeCount();
+        $temp->is_leaf = $child->is_leaf;
+        $temp->count = $this->min_degree - 1;
+        // $temp receives the min_degree - 1 pairs that follow the median
+        // (the median itself sits at index min_degree - 1 of $child).
+        for ($j = 0;$j < $this->min_degree - 1;$j++) {
+            $temp->keys[$j] = $child->keys[$this->min_degree + $j];
+        }
+        // For an internal child, the upper min_degree links move as well.
+        if ($child->is_leaf == false) {
+            for ($j = 0;$j < $this->min_degree;$j++) {
+                $temp->links[$j] = $child->links[$this->min_degree + $j];
+            }
+        }
+        // Open a slot in the parent's links right after position $i; the
+        // loop ends with $j == $i, so the new link lands at $i + 1.
+        for ($j = $parent->count;$j > $i;$j--) {
+            $parent->links[$j + 1] = $parent->links[$j];
+        }
+        $parent->links[$j + 1] = $temp->id;
+        // Open a slot in the parent's keys at position $i; the loop ends
+        // with $j == $i - 1, so the median is stored at index $i.
+        for ($j = $parent->count - 1;$j >= $i;$j--) {
+            $parent->keys[$j + 1] = $parent->keys[$j];
+        }
+        $parent->keys[$j + 1] = $child->keys[$this->min_degree - 1];
+        $parent->count = count($parent->keys);
+        // $child keeps only the pairs (and links) that precede the median.
+        $child->keys = array_slice($child->keys, 0, $this->min_degree - 1);
+        if ($child->is_leaf == false) {
+            $child->links = array_slice($child->links, 0, $this->min_degree);
+        }
+        $child->count = count($child->keys);
+        $this->writeNode($child);
+        $this->writeNode($temp);
+        $this->writeNode($parent);
+    }
+    /**
+     * Swaps value of two variables
+     * @param $x is the first variable
+     * @param $y is the second variable
+     */
+    public function swap(&$x, &$y)
+    {
+        // Both parameters are references, so the caller's variables are
+        // exchanged. The right-hand array holds copies of the old values.
+        list($x, $y) = [$y, $x];
+    }
+    /**
+     * Creates an empty non-leaf node
+     * @return object $node is the non-leaf node
+     */
+    public function createEmptyParentNode()
+    {
+        // Claim the next node id first; saveNodeCount() is called only after
+        // the counter has been advanced.
+        $this->id_count += 1;
+        $parent = new BTNode();
+        $parent->is_leaf = false;
+        $parent->id = $this->id_count;
+        $this->saveNodeCount();
+        return $parent;
+    }
+    /**
+     * Performs binary search for a integer key on an array of integer key based
+     * key-value pairs
+     * @param array $keys is an array containing key-value pairs
+     * @param int $key is the key
+     * @return array containing flag indicating if the value was found or not,
+     * and the position equal to, or nearest to the position of the key being
+     * searched
+     */
+    public function binarySearch($keys, $key)
+    {
+        $low = 0;
+        // A freshly constructed BTNode has $keys === null. count() warns on
+        // null from PHP 7.2 and throws a TypeError from PHP 8, so treat a
+        // missing or empty key list as "nothing to search".
+        $high = empty($keys) ? -1 : count($keys) - 1;
+        $flag = -1;
+        while ($high >= $low) {
+            $middle = (int)floor(($high + $low) / 2);
+            if ($key == $keys[$middle][0]) {
+                // Exact match: flag 1 together with the matching index.
+                $flag = 1;
+                return [$flag, $middle];
+            } else if ($key > $keys[$middle][0]) {
+                $low = $middle + 1;
+            } else {
+                $high = $middle - 1;
+            }
+        }
+        // Not found: $low is the index at which $key would have to be
+        // inserted to keep the pairs ordered (0 for an empty list).
+        return [$flag, $low];
+    }
+    /**
+     * Removes a key-value pair from the B-Tree
+     * @param int $key associated with the key-value pair to be deleted
+     */
+    public function remove($key)
+    {
+        // Deletion always starts its search from the root node.
+        $start = $this->root;
+        $this->delete($start, $key);
+    }
+    /**
+     * Deletes a key-value pair from the B-Tree from a node.
+     * Handles deletion from leaf node and internal node. If the key-value pair
+     * is not found in an internal node, the recursion descends to the root
+     * of the sub-tree until a leaf node is encountered that does not have the
+     * key-value pair to be deleted.
+     * @param object $node is from where the key search starts
+     * @param int $key is the key to be deleted
+     */
+    public function delete($node, $key)
+    {
+        list($found, $index) = $this->binarySearch($node->keys, $key);
+        if ($found == 1 && $node->is_leaf == false) {
+            // May rotate the pair down into one of the adjacent children.
+            $this->reArrange($node, $index);
+        }
+        // Look the key up again: reArrange() can have changed $node->keys.
+        list($found, $index) = $this->binarySearch($node->keys, $key);
+        if ($found == 1) {
+            if ($node->is_leaf == true) {
+                $this->deleteFromLeaf($node, $index);
+            } else {
+                $this->deleteFromNonLeaf($node, $index);
+            }
+        } else if ($node->is_leaf == false) {
+            // Not in this internal node: continue in the sub-tree that could
+            // contain the key. A miss in a leaf simply ends the search.
+            $sub_tree_root = $this->getDescendant($node, $index);
+            $this->delete($sub_tree_root, $key);
+        }
+    }
+    /**
+     * Shifts a key from a non-leaf node to one of its child nodes using the
+     * child nodes preceding and next to the key-value pair to be deleted. If
+     * the preceding child node has at least MIN_DEGREE keys, the last
+     * key-value pair from the preceding node is moved to the position of the
+     * key-value pair that is to be deleted. Otherwise, if the next child node
+     * has at least MIN_DEGREE keys, the same process is done using the first
+     * key-value pair of the child node next to the key-value pair to be
+     * deleted.
+     * @param object $node is the internal node containing the key-value pair to
+     * be deleted
+     * @param int $pos is the position of the key-value pair within $node.
+     */
+    public function reArrange(&$node, $pos)
+    {
+        // Children on either side of the pair stored at $pos.
+        $left = $this->readNode($node->links[$pos]);
+        $right = $this->readNode($node->links[$pos + 1]);
+        $min_keys = $this->min_degree;
+        if ($left->count >= $min_keys) {
+            // Left child can spare a pair: rotate through the parent into
+            // the right child.
+            $this->adjustChildUsingLeftSiblingAndParent($node, $right, $left,
+                $pos + 1);
+            return;
+        }
+        if ($right->count >= $min_keys) {
+            // Right child can spare a pair: rotate through the parent into
+            // the left child.
+            $this->adjustChildUsingRightSiblingAndParent($node, $left,
+                $right, $pos);
+        }
+    }
+    /**
+     * Deletes key-value pair from a leaf node in a B-Tree
+     * @param object& $node is the leaf node containing the key-value pair
+     * @param int $pos in node to delete
+     */
+    public function deleteFromLeaf(&$node, $pos)
+    {
+        // Remove the pair at $pos; array_splice closes the gap and keeps the
+        // remaining pairs indexed 0..n-1.
+        array_splice($node->keys, $pos, 1);
+        $node->count -= 1;
+        $this->writeNode($node);
+        // Removing the last pair of the root empties the whole tree.
+        if ($node == $this->root && $node->count == 0) {
+            $this->deleteNodeFile("root");
+            $this->deleteCount();
+        }
+    }
+    /**
+     * Deletes key-value pair from a non-leaf node in a B-Tree
+     * @param object& $node is the non-leaf node containing the key-value pair
+     * @param int $pos link position in node to delete
+     */
+    public function deleteFromNonLeaf(&$node, $pos)
+    {
+        // $pred is the child to the left of the pair stored at $pos.
+        $pred_id = $node->links[$pos];
+        $pred = $this->readNode($pred_id);
+        if ($pred->count >= $this->min_degree) {
+            // Case 1: the left child can spare a pair. Its last pair is
+            // removed from the child and written over the pair at $pos.
+            // NOTE(review): this takes the last pair of the immediate left
+            // child, which is the in-order predecessor only when $pred is a
+            // leaf -- confirm callers guarantee that (delete() runs
+            // reArrange() first, which looks like it handles this situation
+            // before this branch can be reached).
+            $pred_pair = $pred->keys[$pred->count - 1];
+            $pred_key = $pred_pair[0];
+            $this->delete($pred, $pred_key);
+            $node->keys[$pos] = $pred_pair;
+            $this->writeNode($node);
+        } else {
+            // $next is the child to the right of the pair stored at $pos.
+            $next_id = $node->links[$pos + 1];
+            $next = $this->readNode($next_id);
+            if ($next->count >= $this->min_degree) {
+                // Case 2: mirror image of case 1 using the first pair of the
+                // right child.
+                $next_pair = $next->keys[0];
+                $next_key = $next_pair[0];
+                $this->delete($next, $next_key);
+                $node->keys[$pos] = $next_pair;
+                $this->writeNode($node);
+            } else {
+                // Case 3: neither child can spare a pair. Merge
+                // $pred + pair at $pos + $next into $pred, then delete the
+                // key from the merged node.
+                $node_pair = $node->keys[$pos];
+                $node_key = $node_pair[0];
+                $pred->keys[$pred->count] = $node_pair;
+                $pred->count += 1;
+                // Remove the pair at $pos and the link at $pos + 1 (the link
+                // to $next) from $node.
+                if ($pos == $node->count - 1) {
+                    array_pop($node->keys);
+                    array_pop($node->links);
+                    $node->count -= 1;
+                } else {
+                    for ($i = $pos + 1;$i < $node->count;$i++) {
+                        $node->keys[$i - 1] = $node->keys[$i];
+                    }
+                    $node->keys = array_slice($node->keys, 0, $node->count - 1);
+                    for ($i = $pos + 2;$i <= $node->count;$i++) {
+                        $node->links[$i - 1] = $node->links[$i];
+                    }
+                    $node->links = array_slice($node->links, 0, $node->count);
+                    $node->count -= 1;
+                }
+                // Append all pairs (and, for internal nodes, all links) of
+                // $next behind the pair that was just moved down.
+                for ($i = 0;$i < $next->count;$i++) {
+                    $pred->keys[$pred->count + $i] = $next->keys[$i];
+                }
+                if ($next->is_leaf == false) {
+                    for ($i = 0;$i <= $next->count;$i++) {
+                        $pred->links[$pred->count + $i] = $next->links[$i];
+                    }
+                }
+                $pred->count += $next->count;
+                $this->writeNode($pred);
+                $this->deleteNodeFile($next->id);
+                if ($node == $this->root && $node->count == 0) {
+                    // The root lost its only pair: the merged node takes
+                    // over the "root" id and its old node file is removed.
+                    $old_id = $pred->id;
+                    $pred->id = "root";
+                    $this->root = $pred;
+                    $this->deleteNodeFile($old_id);
+                    $this->writeNode($this->root);
+                } else {
+                    $this->writeNode($node);
+                }
+                // The key now lives inside the merged node; remove it there.
+                $this->delete($pred, $node_key);
+            }
+        }
+    }
+    /**
+     * If the key to be deleted is not found in an internal node, finds the root
+     * of the sub-tree that might contain the key to be deleted. If the node
+     * contains atleast $min_degree number of keys, the node is returned.
+     * Otherwise, the node is adjusted using one of its sibling nodes and the
+     * parent node so that the resultant node has $min_degree keys.
+     * @param object $parent is the parent node
+     * @param int $pos is the link to the root of the sub-tree
+     * @return object $child is the child node to which the recursion will
+     * descend
+     */
+    public function getDescendant($parent, $pos)
+    {
+        $child_id = $parent->links[$pos];
+        $child = $this->readNode($child_id);
+        // Only a child holding exactly min_degree - 1 pairs is topped up
+        // before the caller descends into it.
+        if ($child->count == $this->min_degree - 1) {
+            // $siblings is [left id, right id]; -1 marks a missing sibling.
+            $siblings = $this->getSiblings($parent, $pos);
+            if ($siblings[0] !== -1 && $siblings[1] !== -1) {
+                // Both siblings exist: try borrowing from the left one
+                // first, then from the right one, otherwise merge.
+                $pred_id = $siblings[0];
+                $pred = $this->readNode($pred_id);
+                if ($pred->count >= $this->min_degree) {
+                    $this->adjustChildUsingLeftSiblingAndParent($parent, $child,
+                        $pred, $pos);
+                    return $child;
+                } else {
+                    $next_id = $siblings[1];
+                    $next = $this->readNode($next_id);
+                    if ($next->count >= $this->min_degree) {
+                        $this->adjustChildUsingRightSiblingAndParent($parent,
+                            $child, $next, $pos);
+                        return $child;
+                    } else {
+                        // Neither sibling can spare a pair. Merge with the
+                        // left sibling unless it holds more pairs than the
+                        // right one.
+                        if ($pred->count <= $next->count) {
+                            // $pred absorbs $child, so the search continues
+                            // in $pred.
+                            $this->mergeChildWithParentKeyAndRightSibling(
+                                $parent, $pred, $child, $pos - 1);
+                            return $pred;
+                        } else {
+                            $this->mergeChildWithParentKeyAndRightSibling(
+                                $parent, $child, $next, $pos);
+                            return $child;
+                        }
+                    }
+                }
+            } else if ($siblings[0] !== -1) {
+                // Only a left sibling exists: borrow from it or merge into
+                // it.
+                $pred_id = $siblings[0];
+                $pred = $this->readNode($pred_id);
+                if ($pred->count >= $this->min_degree) {
+                    $this->adjustChildUsingLeftSiblingAndParent($parent, $child,
+                        $pred, $pos);
+                    return $child;
+                } else {
+                    $this->mergeChildWithParentKeyAndRightSibling($parent,
+                        $pred, $child, $pos - 1);
+                    return $pred;
+                }
+            } else {
+                // Only a right sibling is available: borrow from it or
+                // absorb it into $child.
+                $next_id = $siblings[1];
+                $next = $this->readNode($next_id);
+                if ($next->count >= $this->min_degree) {
+                    $this->adjustChildUsingRightSiblingAndParent($parent,
+                        $child, $next, $pos);
+                    return $child;
+                } else {
+                    $this->mergeChildWithParentKeyAndRightSibling($parent,
+                        $child, $next, $pos);
+                    return $child;
+                }
+            }
+        } else return $child;
+    }
+    /**
+     * Gives a child node an extra key by moving a key from the parent to the
+     * child node, and by moving a key from the child's left sibling to the
+     * parent node
+     * @param object $parent is the parent node
+     * @param object $child is the child node
+     * @param object $pred is the $child's left sibling node
+     * @param $pos is the link from $parent to $child
+     */
+    public function adjustChildUsingLeftSiblingAndParent(&$parent, &$child,
+        &$pred, $pos)
+    {
+        // Take the last pair (and, for an internal node, the last link) off
+        // the left sibling.
+        $pred_pair = array_pop($pred->keys);
+        $pred_link = -1;
+        if ($pred->is_leaf == false) {
+            $pred_link = array_pop($pred->links);
+        }
+        $pred->count -= 1;
+        $this->writeNode($pred);
+        // The pair stored at $pos - 1 in the parent moves down to become the
+        // first pair of $child.
+        $parent_pair = $parent->keys[$pos - 1];
+        // Shift the child's pairs one slot to the right to free index 0.
+        for ($i = $child->count - 1;$i >= 0;$i--) {
+            $child->keys[$i + 1] = $child->keys[$i];
+        }
+        $child->keys[0] = $parent_pair;
+        if ($child->is_leaf == false) {
+            // Same shift for the links; the sibling's last link becomes the
+            // child's first link.
+            for ($i = $child->count;$i >= 0;$i--) {
+                $child->links[$i + 1] = $child->links[$i];
+            }
+            $child->links[0] = $pred_link;
+        }
+        $child->count += 1;
+        $this->writeNode($child);
+        // The sibling's former last pair replaces the pair that moved down.
+        $parent->keys[$pos - 1] = $pred_pair;
+        $this->writeNode($parent);
+    }
+    /**
+     * Gives a child node an extra key by moving a key from the parent to the
+     * child node, and by moving a key from the child's right sibling to the
+     * parent node
+     * @param object& $parent is the parent node
+     * @param object& $child is the child node
+     * @param object& $next is the $child's right sibling node
+     * @param int $pos is the link from $parent to $child
+     */
+    public function adjustChildUsingRightSiblingAndParent(&$parent, &$child,
+        &$next, $pos)
+    {
+        // Take the first pair off the right sibling and close the gap.
+        $next_pair = $next->keys[0];
+        $next_link = -1;
+        for ($i = 1;$i < $next->count;$i++) {
+            $next->keys[$i - 1] = $next->keys[$i];
+        }
+        $next->keys = array_slice($next->keys, 0, $next->count - 1);
+        if ($next->is_leaf == false) {
+            // An internal sibling also gives up its first link.
+            $next_link = $next->links[0];
+            for ($i = 1;$i <= $next->count;$i++) {
+                $next->links[$i - 1] = $next->links[$i];
+            }
+            $next->links = array_slice($next->links, 0, $next->count);
+        }
+        $next->count -= 1;
+        $this->writeNode($next);
+        // The pair stored at $pos in the parent moves down to become the
+        // last pair of $child.
+        $parent_pair = $parent->keys[$pos];
+        $child->keys[$child->count] = $parent_pair;
+        $child->count += 1;
+        if ($child->is_leaf == false) {
+            // $child->count was already incremented, so this index is the
+            // slot just past the child's previous last link.
+            $child->links[$child->count] = $next_link;
+        }
+        $this->writeNode($child);
+        // The sibling's former first pair replaces the pair that moved down.
+        $parent->keys[$pos] = $next_pair;
+        $this->writeNode($parent);
+    }
+    /**
+     * Merges the child node with its right sibling. The separating key in the
+     * parent node is added as the median key to the newly formed node
+     * @param object $parent is the parent node
+     * @param object $child is the child node
+     * @param object $next is the $child's right sibling node
+     * @param $pos is the link from $parent to $child
+     */
+    public function mergeChildWithParentKeyAndRightSibling(&$parent, &$child,
+        &$next, $pos)
+    {
+        // The separating pair at $pos in the parent is appended to $child.
+        $parent_pair = $parent->keys[$pos];
+        $child->keys[$child->count] = $parent_pair;
+        $child->count += 1;
+        // All pairs (and, for internal nodes, all links) of the right
+        // sibling follow behind it.
+        for ($i = 0;$i < $next->count;$i++) {
+            $child->keys[$child->count + $i] = $next->keys[$i];
+        }
+        if ($next->is_leaf == false) {
+            for ($i = 0;$i <= $next->count;$i++) {
+                $child->links[$child->count + $i] = $next->links[$i];
+            }
+        }
+        $child->count = count($child->keys);
+        $this->writeNode($child);
+        // The sibling's node file is removed once its contents are merged.
+        $this->deleteNodeFile($next->id);
+        // Remove the pair at $pos and the link at $pos + 1 (the link to the
+        // merged-away sibling) from the parent.
+        if ($pos == $parent->count - 1) {
+            array_pop($parent->keys);
+            array_pop($parent->links);
+            $parent->count -= 1;
+        } else {
+            for ($i = $pos + 1;$i < $parent->count;$i++) {
+                $parent->keys[$i - 1] = $parent->keys[$i];
+            }
+            $parent->keys = array_slice($parent->keys, 0, $parent->count - 1);
+            for ($i = $pos + 2;$i <= $parent->count;$i++) {
+                $parent->links[$i - 1] = $parent->links[$i];
+            }
+            $parent->links = array_slice($parent->links, 0, $parent->count);
+            $parent->count -= 1;
+        }
+        if ($parent == $this->root && $parent->count == 0) {
+            // The root lost its only pair: the merged child takes over the
+            // "root" id and the file stored under its old id is removed.
+            $old_id = $child->id;
+            $child->id = "root";
+            $this->root = $child;
+            $this->deleteNodeFile($old_id);
+            $this->writeNode($this->root);
+        } else {
+            $this->writeNode($parent);
+        }
+    }
+    /**
+     * Gets the siblings ids based on link in parent node
+     * @param object $parent is the parent node
+     * @param int $pos is the link for which the siblings are to be found
+     */
+    public function getSiblings($parent, $pos)
+    {
+        // Result is always [left sibling id, right sibling id]; -1 stands
+        // for "no sibling on that side".
+        $links = $parent->links;
+        if ($pos > 0 && $pos < $parent->count) {
+            return [$links[$pos - 1], $links[$pos + 1]];
+        }
+        if ($pos == 0) {
+            // First link: nothing to the left.
+            return [-1, $links[$pos + 1]];
+        }
+        // Remaining case (last link): nothing to the right.
+        return [$links[$pos - 1], -1];
+    }
+}
+/**
+ * Class for B-Tree nodes
+ */
+class BTNode
+{
+    /**
+     * Identifier of this node; -1 until an id has been assigned
+     * @var int
+     */
+    public $id;
+    /**
+     * True while the node is a leaf, false once it is an internal node
+     * @var boolean
+     */
+    public $is_leaf;
+    /**
+     * Number of key-value pairs currently held in $keys
+     * @var int
+     */
+    public $count;
+    /**
+     * Key-value pairs stored in this node; null until the first pair is added
+     * @var array
+     */
+    public $keys;
+    /**
+     * Ids of the child nodes of this node; null until a link is added
+     * @var array
+     */
+    public $links;
+    /**
+     * Sets up an empty leaf node: id -1, no pairs and no child links
+     */
+    public function __construct()
+    {
+        $this->keys = null;
+        $this->links = null;
+        $this->count = 0;
+        $this->is_leaf = true;
+        $this->id = -1;
+    }
+}
--- a/src/library/BloomFilterBundle.php
+++ b/src/library/BloomFilterBundle.php
@ -0,0 +1,235 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ *
+ * A BloomFilterBundle is a directory of BloomFilterFile.
+ * The filter bundle, like a Bloom filter, also acts as a set,
+ * but once the active filter in it fills up a new filter is
+ * added to the bundle so that more data can be stored.
+ *
+ * @author Chris Pollett
+ * @see BloomFilterFile
+ */
+class BloomFilterBundle
+{
+    /**
+     * Reference to the filter which will be used to store new data
+     * @var object
+     */
+    public $current_filter;
+    /**
+     * Total number of filters that this filter bundle currently has
+     * @var int
+     */
+    public $num_filters;
+    /**
+     * The number of items which have been stored in the current filter
+     * @var int
+     */
+    public $current_filter_count;
+    /**
+     * The maximum capacity of a filter in this filter bundle
+     * @var int
+     */
+    public $filter_size;
+    /**
+     * The folder name of this filter bundle
+     * @var string
+     */
+    public $dir_name;
+    /**
+     * The default maximum size of a filter in a filter bundle
+     */
+    const default_filter_size = 10000000;
+    /**
+     * Creates or loads if already exists the directory structure and
+     * BloomFilterFiles used by this bundle
+     *
+     * @param string $dir_name directory where this bundle's data is stored
+     * @param int $filter_size the size of an individual filter in this bundle
+     *     once a filter is filled a new one is added to the directory
+     */
+    public function __construct($dir_name,
+        $filter_size = self::default_filter_size )
+    {
+        $this->dir_name = $dir_name;
+        if (!is_dir($dir_name)) {
+            mkdir($dir_name);
+        }
+        $this->loadMetaData();
+        if ($this->num_filters == 0) {
+            // brand new bundle: start with a single empty filter on disk
+            $this->current_filter =
+                new BloomFilterFile($dir_name."/filter_0.ftr", $filter_size);
+            $this->num_filters++;
+            $this->filter_size = $filter_size;
+            $this->current_filter->save();
+            $this->saveMetaData();
+        } else {
+            // existing bundle: the highest numbered filter is the active one
+            $last_filter = $this->num_filters - 1;
+            $this->current_filter =
+                BloomFilterFile::load($dir_name."/filter_$last_filter.ftr");
+        }
+    }
+    /**
+     * Inserts a $value into the BloomFilterBundle
+     *
+     * This involves inserting into the current filter, if the filter
+     * is full, a new filter is added before the value is added
+     *
+     * @param string $value a item to add to the filter bundle
+     */
+    public function add($value)
+    {
+        if ($this->current_filter_count >= $this->filter_size) {
+            // active filter is full: save it, release it, and start the
+            // next numbered filter
+            $this->current_filter->save();
+            $this->current_filter = null;
+            gc_collect_cycles();
+            $last_filter = $this->num_filters;
+            $this->current_filter =
+                new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr",
+                    $this->filter_size);
+            $this->current_filter_count = 0;
+            $this->num_filters++;
+            $this->saveMetaData();
+        }
+        $this->current_filter->add($value);
+        $this->current_filter_count++;
+    }
+    /**
+     * Removes from the passed array those elements $elt who either are in
+     * the filter bundle or whose $elt[$field_name] is in the bundle.
+     *
+     * Elements are visited by the integer indices 0 .. count($arr) - 1.
+     * Removed elements are unset, so $arr may have gaps afterwards.
+     *
+     * @param array& $arr the array to remove elements from
+     * @param array $field_names if not null an array of field names of $arr
+     *     to use to do filtering
+     */
+    public function differenceFilter(&$arr, $field_names = null)
+    {
+        $incremental_time = microtime(true);
+        $num_filters = $this->num_filters;
+        $count = count($arr);
+        for ($i = 0; $i < $num_filters; $i++) {
+            if ($i == $num_filters - 1) {
+                $tmp_filter = $this->current_filter;
+            } else {
+                $tmp_filter =
+                    BloomFilterFile::load($this->dir_name."/filter_$i.ftr");
+            }
+            for ($j = 0; $j < $count; $j++) {
+                /*
+                    Values are read by copy rather than by reference.
+                    Binding a reference to $arr[$j] (or to a field of it)
+                    creates that slot as null when it does not exist, which
+                    would bring back elements removed by an earlier filter
+                    and add missing fields to the remaining ones.
+                 */
+                if (array_key_exists($j, $arr)) {
+                    if ($field_names === null) {
+                        $tmp = $arr[$j];
+                        if ($tmp !== false && $tmp_filter->contains($tmp)) {
+                        /*
+                            We deliberately don't try to add anything that
+                            has the hash field set to false. This is our cue
+                            to skip an element such as a link document which
+                            we know will almost always be unique and so be
+                            unnecessary to de-duplicate
+                         */
+                            unset($arr[$j]);
+                        }
+                    } else { //same strategy for the array of fields case
+                        foreach ($field_names as $field_name) {
+                            $tmp = isset($arr[$j][$field_name]) ?
+                                $arr[$j][$field_name] : null;
+                            if ($tmp !== false &&
+                                $tmp_filter->contains($tmp)) {
+                                unset($arr[$j]);
+                                break;
+                            }
+                        }
+                    }
+                }
+                if (changeInMicrotime($incremental_time) > 30 ) {
+                    crawlLog("..Processing item $j of $count from filter ".
+                        "number $i of $num_filters.");
+                    $incremental_time = microtime(true);
+                }
+            }
+        }
+    }
+    /**
+     * Loads from the filter bundles' meta.txt the meta data associated with
+     * this filter bundle and stores this data into field variables. A
+     * missing or unreadable meta.txt, or a missing entry in it, falls back
+     * to the values of an empty bundle.
+     */
+    public function loadMetaData()
+    {
+        $meta = false;
+        if (file_exists($this->dir_name.'/meta.txt')) {
+            $meta = unserialize(
+                file_get_contents($this->dir_name.'/meta.txt') );
+        }
+        if (!is_array($meta)) {
+            // unserialize() returns false when the file content is damaged
+            $meta = [];
+        }
+        $this->num_filters = isset($meta['NUM_FILTERS']) ?
+            $meta['NUM_FILTERS'] : 0;
+        $this->current_filter_count = isset($meta['CURRENT_FILTER_COUNT']) ?
+            $meta['CURRENT_FILTER_COUNT'] : 0;
+        $this->filter_size = isset($meta['FILTER_SIZE']) ?
+            $meta['FILTER_SIZE'] : self::default_filter_size;
+    }
+    /**
+     * Saves the meta data (number of filter, number of items stored, and size)
+     * of the bundle
+     */
+    public function saveMetaData()
+    {
+        $meta = [];
+        $meta['NUM_FILTERS'] = $this->num_filters;
+        $meta['CURRENT_FILTER_COUNT'] = $this->current_filter_count;
+        $meta['FILTER_SIZE'] = $this->filter_size;
+        file_put_contents($this->dir_name.'/meta.txt', serialize($meta));
+    }
+    /**
+     * Empties the contents of the bloom filter bundle and resets
+     * it to start storing new data.
+     */
+    public function reset()
+    {
+        for ($i = 0; $i < $this->num_filters; $i++) {
+            @unlink($this->dir_name."/filter_$i.ftr");
+        }
+        $this->num_filters = 0;
+        $this->current_filter_count = 0;
+        $this->current_filter =
+            new BloomFilterFile($this->dir_name."/filter_0.ftr",
+            $this->filter_size);
+        $this->num_filters++;
+        $this->current_filter->save();
+        $this->saveMetaData();
+    }
+    /**
+     * Used to save to disk all the file data associated with this bundle
+     */
+    public function forceSave()
+    {
+        $this->saveMetaData();
+        $this->current_filter->save();
+    }
+}
--- a/src/library/BloomFilterFile.php
+++ b/src/library/BloomFilterFile.php
@ -0,0 +1,167 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * For packInt/unpackInt
+ */
+require_once __DIR__."/Utility.php";
+
+/**
+ * Code used to manage a bloom filter in-memory and in file.
+ * A Bloom filter is used to store a set of objects.
+ * It can support inserts into the set and it can also be
+ * used to check membership in the set.
+ *
+ * @author Chris Pollett
+ */
+class BloomFilterFile extends PersistentStructure
+{
+    /**
+     * Number of bit positions in the Bloom filter used to say an item is
+     * in the filter
+     * @var int
+     */
+    public $num_keys;
+    /**
+     * Size in bits of the packed string array used to store the filter's
+     * contents
+     * @var int
+     */
+    public $filter_size;
+    /**
+     * Packed string used to store the Bloom filters
+     * @var string
+     */
+    public $filter;
+    /**
+     * Initializes the fields of the BloomFilter and its base
+     * PersistentStructure.
+     *
+     * The number of keys is ceil(log2($num_values)) and the filter size in
+     * bits is ceil(num_keys * $num_values / ln(2)). A $num_values of 1
+     * therefore gives zero keys and an empty filter.
+     *
+     * @param string $fname name of the file to store the BloomFilter data in
+     * @param int $num_values the maximum number of values that will be stored
+     *     in the BloomFilter. Filter will be sized so the odds of a false
+     *     positive are roughly one over this value
+     * @param int $save_frequency how often to store the BloomFilter to disk
+     */
+    public function __construct($fname, $num_values,
+        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
+    {
+        $log2 = log(2);
+        // ceil() returns a float; cast so the fields hold the ints their
+        // documentation promises
+        $this->num_keys = (int)ceil(log($num_values) / $log2);
+        $this->filter_size =
+            (int)ceil($this->num_keys * $num_values / $log2);
+        // 1/8 = .125 = bytes per bit; "x<n>" packs n zero bytes
+        $this->filter = pack("x" . (int)ceil(0.125 * $this->filter_size));
+        parent::__construct($fname, $save_frequency);
+    }
+    /**
+     * Inserts the provided item into the Bloomfilter by setting each of
+     * the bit positions the item hashes to
+     *
+     * @param string $value item to add to filter
+     */
+    public function add($value)
+    {
+        $positions = $this->getHashBitPositionArray($value,
+            $this->num_keys);
+        foreach ($positions as $position) {
+            $this->setBit($position);
+        }
+        $this->checkSave();
+    }
+    /**
+     * Checks if the BloomFilter contains the provided $value, that is,
+     * whether every bit position the value hashes to is set
+     *
+     * @param string $value item to check if is in the BloomFilter
+     * @return bool whether $value was in the filter or not
+     */
+    public function contains($value)
+    {
+        $positions = $this->getHashBitPositionArray($value,
+            $this->num_keys);
+        foreach ($positions as $position) {
+            if (!$this->getBit($position)) {
+                return false;
+            }
+        }
+        return true;
+    }
+    /**
+     * Hashes $value to $num_keys bit positions in the BloomFilter
+     *
+     * @param string $value value to map to bit positions in the filter
+     * @param int $num_keys number of bit positions in the Bloom filter
+     *      used to say an item is in the filter
+     * @return array the $num_keys bit positions (ints) mapped to, indexed
+     *      from 0
+     */
+    public function getHashBitPositionArray($value, $num_keys)
+    {
+        /*
+            Each raw md5 digest is 16 bytes, i.e., four 32-bit integers, so
+            (num_keys >> 2) + 1 chained digests always supply at least
+            num_keys integers.
+         */
+        $offset = ($num_keys >> 2) + 1;
+        $rand_string = "";
+        for ($i = 0 ; $i < $offset; $i++) {
+            $value = md5($value, true);
+            $rand_string .= $value;
+        }
+        $seed = array_values(unpack("N*", $rand_string));
+        $pos_array = [];
+        /*
+            For a non-negative seed the position lies between size - 1 and
+            2 * size - 2, where size is half the filter size. A negative
+            seed gives a negative remainder and so a position below
+            size - 1.
+         */
+        $size = $this->filter_size >> 1;
+        $less_one = $size - 1;
+        for ($i = 0; $i < $num_keys; $i++) {
+            $pos_array[$i] = ($seed[$i] % $size) + $less_one;
+        }
+        return $pos_array;
+    }
+    /**
+     * Sets to true the ith bit position in the filter.
+     *
+     * @param int $i the position to set to true
+     */
+    public function setBit($i)
+    {
+        $byte = ($i >> 3);
+        $bit_in_byte = $i - ($byte << 3);
+        // | on two one-character strings ors their bytes together
+        $tmp = $this->filter[$byte];
+        $this->filter[$byte] = $tmp | chr(1 << $bit_in_byte);
+    }
+    /**
+     * Looks up the value of the ith bit position in the filter
+     *
+     * @param int $i the position to look up
+     * @return bool the value of the looked up position
+     */
+    public function getBit($i)
+    {
+        $byte = $i >> 3;
+        $bit_in_byte = $i - ($byte << 3);
+        return ($this->filter[$byte] & chr(1 << $bit_in_byte)) != chr(0);
+    }
+}
--- a/src/library/BrowserRunner.php
+++ b/src/library/BrowserRunner.php
@ -0,0 +1,80 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ * LICENSE:
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * END LICENSE
+ *
+ * @author Eswara Rajesh Pinapala epinapala@live.com
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Used to execute browser-based Javascript and browser page rendering from PHP.
+ *
+ * @author Eswara Rajesh Pinapala
+ */
+class BrowserRunner
+{
+    /**
+     * Checks that a headless browser (typically PhantomJS) can be run by
+     * asking it for its version. If that yields nothing, an exception is
+     * thrown, so an object of this class is only constructed when the
+     * browser is available.
+     *
+     * @throws \Exception if the version request produces no result
+     */
+    public function __construct()
+    {
+        if (!$this->execute("-v")) {
+            throw new \Exception("BrowserRunner currently requires PhantomJS ".
+                "package to run");
+        }
+    }
+    /**
+     * Runs a Javascript in the headless browser and returns the results
+     * either as the raw output or as a PHP array decoded from JSON.
+     *
+     * Every argument passed to this method (including $decode_json, when
+     * it is supplied) is joined with spaces and appended to the browser
+     * command line.
+     *
+     * @param string $script Javascript to run in browser
+     * @param bool $decode_json whether to leave result as is or to convert
+     *      from JSON to a PHP array
+     * @return mixed false if the command gave no result or its output
+     *      started with { but could not be decoded; the decoded array if
+     *      $decode_json is set and the output starts with {; otherwise,
+     *      the output string unchanged
+     */
+    public function execute($script, $decode_json = false)
+    {
+        $arguments = implode(' ', func_get_args());
+        $output = shell_exec(escapeshellcmd(C\PHANTOM_JS . " " . $arguments));
+        if ($output === null) {
+            return false;
+        }
+        // Anything not asked to be decoded, or not looking like a JSON
+        // object, is handed back untouched
+        if (!$decode_json || substr($output, 0, 1) !== '{') {
+            return $output;
+        }
+        $decoded = json_decode($output, true);
+        return ($decoded === null) ? false : $decoded;
+    }
+}
--- a/src/library/Bzip2BlockIterator.php
+++ b/src/library/Bzip2BlockIterator.php
@ -0,0 +1,334 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Shawn Tice, (docs added by Chris Pollett chris@pollett.org)
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * This class is used to allow one to iterate through a Bzip2 file.
+ * The main advantage of using this class over the built-in bzip is that
+ * it can "remember" where it left off between serializations. So can
+ * continue where left off between web invocations. This is used in
+ * doing archive crawls of wiki dumps to allow the name server to pick up
+ * where it left off.
+ *
+ * @author Shawn Tice, (some docs added by Chris Pollett chris@pollett.org)
+ */
+class BZip2BlockIterator
+{
+    /**
+     * File path of the bz2 file being iterated over. Kept so that
+     * __wakeup can reopen the file after unserialization
+     * @var string
+     */
+    public $path = '';
+    /**
+     * First four bytes of the bz2 file (the three byte magic string plus
+     * the byte following it). Prepended to each recovered block
+     * @var string
+     */
+    public $header = '';
+    /**
+     * File handle for bz2 file
+     * @var resource
+     */
+    public $fd = null;
+    /**
+     * Byte offset into bz2 file
+     * @var int
+     */
+    public $file_offset = 0;
+    /**
+     * Since block sizes are not constant used to store sufficiently many
+     * bytes so can properly extract next blocks
+     * @var string
+     */
+    public $buffer = '';
+    /**
+     * Used to build and store a bz2 block from the file stream
+     * @var string
+     */
+    public $block = '';
+    /**
+     * Stores the left over bits of a bz2 block
+     * @var int
+     */
+    public $bits = 0;
+    /**
+     * Store how many left-over bits there are
+     * @var int
+     */
+    public $num_extra_bits = 0;
+    /**
+     * Lookup table for the number of bits by which the magic
+     * number for the next block has been shifted right. The table is keyed
+     * by the second byte of the shifted magic number. Second
+     * components of sub-arrays say whether block header (true) or
+     * endmark (false)
+     * @var array
+     */
+    public static $header_info = [
+        "\x41" => [0,  true], "\xa0" => [1,  true],
+        "\x50" => [2,  true], "\x28" => [3,  true],
+        "\x14" => [4,  true], "\x8a" => [5,  true],
+        "\xc5" => [6,  true], "\x62" => [7,  true],
+
+        "\x72" => [0, false], "\xb9" => [1, false],
+        "\xdc" => [2, false], "\xee" => [3, false],
+        "\x77" => [4, false], "\xbb" => [5, false],
+        "\x5d" => [6, false], "\x2e" => [7, false]
+    ];
+    /** String to tell if file is a bz2 file*/
+    const MAGIC = 'BZh';
+    /** String at the start of each bz2 block */
+    const BLOCK_HEADER = "\x31\x41\x59\x26\x53\x59";
+    /** String at the end of each bz2 block*/
+    const BLOCK_ENDMARK = "\x17\x72\x45\x38\x50\x90";
+    /**
+     * Blocks are NOT byte-aligned, so the block header (and endmark) may show
+     * up shifted right by 0-7 bits in various places throughout the file. This
+     * regular expression matches any of the possible shifts for both the block
+     * header and the block endmark. Each alternative is five bytes long and
+     * starts at the second byte of the shifted magic number.
+     */
+    const BLOCK_LEADER_RE = '
+        /
+         \x41\x59\x26\x53\x59 | \xa0\xac\x93\x29\xac | \x50\x56\x49\x94\xd6
+        |\x28\x2b\x24\xca\x6b | \x14\x15\x92\x65\x35 | \x8a\x0a\xc9\x32\x9a
+        |\xc5\x05\x64\x99\x4d | \x62\x82\xb2\x4c\xa6
+
+        |\x72\x45\x38\x50\x90 | \xb9\x22\x9c\x28\x48 | \xdc\x91\x4e\x14\x24
+        |\xee\x48\xa7\x0a\x12 | \x77\x24\x53\x85\x09 | \xbb\x92\x29\xc2\x84
+        |\x5d\xc9\x14\xe1\x42 | \x2e\xe4\x8a\x70\xa1
+        /x';
+    /**
+     * How many bytes to read into buffer from bz2 stream in one go
+     */
+    const BLOCK_SIZE = 8192;
+    /**
+     * Creates a new iterator of a bz2 file by opening the file, doing a
+     * sanity check and then setting up the initial file_offset to
+     * where the data starts
+     * @param string $path file path of bz2 file
+     * @throws \Exception if the file cannot be opened, does not start
+     *      with the bz2 magic string, or its first block header is wrong
+     */
+    public function __construct($path)
+    {
+        $this->path = $path;
+        $this->fd = fopen($this->path, 'rb');
+        if ($this->fd === false) {
+            throw new \Exception("Could not open bz2 file: $path");
+        }
+        $this->header = fread($this->fd, 4);
+        if (substr($this->header, 0, 3) != self::MAGIC) {
+            throw new \Exception('Bad bz2 magic number. Not a bz2 file?');
+        }
+        $this->block = fread($this->fd, 6);
+        if ($this->block != self::BLOCK_HEADER) {
+            throw new \Exception('Bad bz2 block header');
+        }
+        // 4 bytes of file header plus 6 bytes of first block header
+        $this->file_offset = 10;
+    }
+    /**
+     * Called by unserialize prior to execution. Reopens the file and
+     * seeks back to the byte offset reached before serialization
+     */
+    public function __wakeup()
+    {
+        $this->fd = fopen($this->path, 'rb');
+        fseek($this->fd, $this->file_offset);
+    }
+    /**
+     * Checks whether the current Bzip2 file has reached an end of file
+     * @return bool eof or not
+     */
+    public function eof()
+    {
+        return feof($this->fd);
+    }
+    /**
+     * Used to close the file associated with this iterator
+     * @return bool whether the file close was successful
+     */
+    public function close()
+    {
+        return fclose($this->fd);
+    }
+    /**
+     * Extracts the next bz2 block from the bzip2 file this iterator works
+     * on
+     * @param bool $raw if false then decompress the recovered block
+     * @return mixed if $raw is true, the recovered block as a bz2 string
+     *      or null if no block boundary was found before the end of the
+     *      file; if $raw is false, whatever bzdecompress returns when
+     *      given that value
+     */
+    public function nextBlock($raw = false)
+    {
+        $recovered_block = null;
+        while(!feof($this->fd)) {
+            $next_chunk = fread($this->fd, self::BLOCK_SIZE);
+            $this->file_offset += strlen($next_chunk);
+            $this->buffer .= $next_chunk;
+            $match = preg_match( self::BLOCK_LEADER_RE, $this->buffer,
+                $matches, PREG_OFFSET_CAPTURE);
+            if ($match) {
+                /*
+                    $pos is the position of the SECOND byte of the magic number
+                    (plus some part of the first byte for a non-zero new_shift).
+                 */
+                $pos = $matches[0][1];
+                /*
+                     The new_shift is the number of bits by which the magic
+                      number for the next block has been shifted right.
+                 */
+                list($new_shift, $is_start) =
+                    self::$header_info[$this->buffer[$pos]];
+                /*
+                    The new number of extra bits is what's left in a byte after
+                    the new shift. For example, if we have 10|001011 as the byte
+                    that begins the next block's header, where the vertical bar
+                    represents the beginning of the header bits, the new shift
+                    is 2, and after we byte-align the new header to the left
+                    there will always be 6 extra bits waiting for two bits to
+                    form a byte to be added to the next block.
+                */
+                $new_num_extra_bits = $new_shift == 0 ? 0 : 8 - $new_shift;
+                if ($new_shift == 0) {
+                    $tail_bits = $new_bits = 0;
+                    $header_end = 5;
+                    $new_header = substr($this->buffer, $pos - 1, 6);
+                    $new_block = $new_header;
+                } else {
+                    $byte = ord($this->buffer[$pos-1]);
+                    $tail_bits = $byte & (((0x1 << $new_shift) - 1) <<
+                        (8 - $new_shift));
+                    $new_bits = ($byte << $new_shift) & 0xff;
+                    $header_end = 6;
+                    $new_block = '';
+                    $new_header = substr($this->buffer, $pos, 6);
+                    $this->packLeft($new_block, $new_bits, $new_header,
+                        $new_num_extra_bits);
+                }
+                // Make sure all six header bytes match.
+                if ($is_start && $new_block != self::BLOCK_HEADER ||
+                        !$is_start && $new_block != self::BLOCK_ENDMARK) {
+                    /*
+                        False alarm: only five of the six bytes matched. Move
+                        everything through the false match into the current
+                        block and drop it from the buffer, so that the same
+                        bytes are neither matched nor packed a second time.
+                     */
+                    $unmatched = substr($this->buffer, 0, $pos + 6);
+                    $keep = substr($this->buffer, $pos + 6);
+                    $this->packLeft($this->block, $this->bits, $unmatched,
+                        $this->num_extra_bits);
+                    $this->buffer = $keep;
+                    continue;
+                }
+                /*
+                    Copy and shift the last chunk of bytes from the previous
+                    block before adding the block trailer.
+                */
+                $block_tail = substr($this->buffer, 0, $pos - 1);
+                $this->packLeft($this->block, $this->bits, $block_tail,
+                    $this->num_extra_bits);
+                /*
+                    We need to combine the non-header tail bits from the most
+                    significant end of the last byte before the next block's
+                    header with whatever extra bits are left over from shifting
+                    the body of the previous block.
+                */
+                $bits_left = 8 - $this->num_extra_bits;
+                if ($new_shift >= $bits_left) {
+                    $this->bits |= ($tail_bits >> $this->num_extra_bits);
+                    $this->block .= chr($this->bits);
+                    $this->bits = ($tail_bits << $bits_left) & 0xff;
+                    $this->num_extra_bits = $new_shift - $bits_left;
+                } else {
+                    $this->bits |= ($tail_bits >> $this->num_extra_bits);
+                    $this->num_extra_bits = $this->num_extra_bits +
+                        $new_shift;
+                }
+                /*
+                    The last block is marked by a different header (sqrt(pi)),
+                    and a CRC for the entire "file", which is just the CRC for
+                    the first block, since there's only one block.
+                */
+                $trailer = self::BLOCK_ENDMARK . substr($this->block, 6, 4);
+                $this->packLeft($this->block, $this->bits, $trailer,
+                    $this->num_extra_bits);
+                if ($this->num_extra_bits != 0) {
+                    $this->block .= chr($this->bits);
+                }
+                $recovered_block = $this->header.$this->block;
+                $this->block = $new_block;
+                /*
+                    Keep everything after the end of the header for the next
+                    block in the buffer.
+                */
+                $this->buffer = substr($this->buffer, $pos + $header_end);
+                $this->bits = $new_bits;
+                $this->num_extra_bits = $new_num_extra_bits;
+                break;
+            } else {
+                /*
+                    No match, but we may have just missed a header by a byte, so
+                    we need to keep the last six bytes in the buffer so that we
+                    have a chance to get the full header on the next round.
+                */
+                $unmatched = substr($this->buffer, 0, -6);
+                $this->packLeft($this->block, $this->bits, $unmatched,
+                    $this->num_extra_bits);
+                $this->buffer = substr($this->buffer, -6);
+            }
+        }
+        if (!$raw) {
+            return bzdecompress($recovered_block);
+        } else {
+            return $recovered_block;
+        }
+    }
+    /**
+     * Computes a new bzip2 block portions and bits left over after adding
+     * $bytes to the passed $block. When there are no extra bits, $bytes is
+     * appended as is; otherwise, each byte is split so that its high bits
+     * complete the pending byte in $bits and its low bits become the new
+     * pending bits.
+     *
+     * @param string& $block the block to add to
+     * @param int& $bits used to hold bits left over
+     * @param string $bytes what to add to the bzip block
+     * @param int $num_extra_bits how many extra bits there are
+     */
+    public function packLeft(&$block, &$bits, $bytes, $num_extra_bits)
+    {
+        if ($num_extra_bits == 0) {
+            $block .= $bytes;
+            return;
+        }
+        $num_bytes = strlen($bytes);
+        for ($i = 0; $i < $num_bytes; $i++) {
+            $byte = ord($bytes[$i]);
+            $bits |= ($byte >> $num_extra_bits);
+            $block .= chr($bits);
+            $bits = ($byte << (8 - $num_extra_bits)) & 0xff;
+        }
+    }
+}
+if (!function_exists("main") && php_sapi_name() == 'cli') {
+    /**
+     * Command-line shell for testing the class. Takes the path of a bz2
+     * file as first argument and an optional output name prefix
+     * (default 'rec') as second argument. Each recovered block is written
+     * to its own file named <prefix><5 digit block number>.bz2
+     */
+    function main()
+    {
+        global $argv;
+        if (!isset($argv[1])) {
+            echo "Usage: php " . basename(__FILE__) .
+                " <bz2_file> [output_prefix]\n";
+            return;
+        }
+        $path = $argv[1];
+        $prefix = isset($argv[2]) ? $argv[2] : 'rec';
+        $itr = new BZip2BlockIterator($path);
+        $i = 1;
+        // raw mode: null signals that no further block could be recovered
+        while(($block = $itr->nextBlock(true)) !== null) {
+            $rec_name = sprintf("%s%05d.bz2", $prefix, $i);
+            file_put_contents($rec_name, $block);
+            echo "Recovered block {$i}\n";
+            $i++;
+        }
+    }
+    // Only run main if this script is called directly from the command line.
+    if (isset($argv[0]) && realpath($argv[0]) == __FILE__) {
+        main();
+    }
+}
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@ -0,0 +1,238 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Shared constants and enums used by components that are involved in the
+ * crawling process
+ *
+ * @author Chris Pollett
+ */
+interface CrawlConstants
+{
+    /**
+     * Used to say what kind of queue_server this is
+     */
+    const BOTH = "IndexerAndScheduler";
+    /**
+     * Used to say what kind of queue_server this is
+     */
+    const INDEXER = "Indexer";
+    /**
+     * Used to say what kind of queue_server this is
+     */
+    const SCHEDULER = "Scheduler";
+    /*
+        Name strings for crawl related data.
+        NOTE(review): presumably these are used to build the file and folder
+        names of bundles, schedules, etc. -- confirm against the classes
+        that implement this interface.
+     */
+    const queue_base_name = "QueueBundle";
+    const archive_base_name = "Archive";
+    const name_archive_iterator = "NameArchiveIterator";
+    const fetch_archive_iterator = "FetchArchiveIterator";
+    const save_point = "SavePoint";
+    const schedule_data_base_name = "ScheduleData";
+    const schedule_name = "FetchSchedule";
+    const robot_data_base_name = "RobotData";
+    const etag_expires_data_base_name = "EtagExpiresData";
+    const index_data_base_name = "IndexData";
+    const network_base_name = "Network";
+    const network_crawllist_base_name = "NetworkCrawlList";
+    const statistics_base_name = "Statistics";
+    const index_closed_name = "IndexClosed";
+    const fetch_batch_name = "FetchBatch";
+    const fetch_crawl_info = "FetchInfo";
+    const fetch_closed_name = "FetchClosed";
+    const data_base_name = "At";
+    const schedule_start_name = "StartCrawlSchedule.txt";
+    const robot_table_name = "robot_table.txt";
+    const mirror_table_name = "mirror_table.txt";
+    /** Used in priority queue*/
+    const MAX = 1;
+    const MIN = -1;
+    /** states of daemon processes*/
+    const STOP_STATE = -1;
+    const CONTINUE_STATE = 1;
+    const NO_DATA_STATE = 2;
+    const WAITING_START_MESSAGE_STATE = 3;
+    const REDO_STATE = 4;
+    /*
+        Short string codes running from 'a' through 'dz'. Each code below is
+        assigned to exactly one constant; the codes 'p' and 'dj' are not
+        assigned in this list.
+        NOTE(review): presumably these serve as compact array keys/field
+        names for crawl data -- confirm with the code using them before
+        changing or reusing any value.
+     */
+    const STATUS = 'a';
+    const CRAWL_TIME = 'b';
+    const HTTP_CODE = 'c';
+    const TIMESTAMP = 'd';
+    const TYPE = 'e';
+    const ENCODING = 'f';
+    const SEEN_URLS = 'g';
+    const MACHINE = 'h';
+    const INVERTED_INDEX = 'i';
+    const SAVED_CRAWL_TIMES= 'j';
+    const SCHEDULE_TIME = 'k';
+    const URL = 'l';
+    const WEIGHT = 'm';
+    const ROBOT_PATHS = 'n';
+    const HASH = 'o';
+    const PAGE = 'q';
+    const DOC_INFO = 'r';
+    const TITLE = 's';
+    const DESCRIPTION = 't';
+    const THUMB = 'u';
+    const CRAWL_DELAY = 'v';
+    const LINKS = 'w';
+    const ROBOT_TXT = 'x';
+    const TO_CRAWL = 'y';
+    const INDEX = 'z';
+    const AVERAGE_TITLE_LENGTH = 'A';
+    const AVERAGE_DESCRIPTION_LENGTH = 'B';
+    const AVERAGE_TOTAL_LINK_TEXT_LENGTH = 'C';
+    const TITLE_LENGTH = 'D';
+    const DESCRIPTION_LENGTH = 'E';
+    const LINK_LENGTH = 'F';
+    const TITLE_WORDS = 'G';
+    const DESCRIPTION_WORDS = 'H';
+    const LINK_WORDS = 'I';
+    const TITLE_WORD_SCORE = 'J';
+    const DESCRIPTION_WORD_SCORE = 'K';
+    const LINK_WORD_SCORE = 'L';
+    const DOC_DEPTH = 'M';
+    const DOC_RANK = 'N';
+    const URL_WEIGHT = 'O';
+    const INLINKS = 'P';
+    const NEW_CRAWL = 'Q';
+    const OFFSET = 'R';
+    const PATHS = 'S';
+    const HASH_URL = 'T';
+    const SUMMARY_OFFSET = 'U';
+    const DUMMY = 'V';
+    const SITES = 'W';
+    const SCORE = 'X';
+    const CRAWL_ORDER = 'Y';
+    const RESTRICT_SITES_BY_URL = 'Z';
+    const ALLOWED_SITES = 'aa';
+    const DISALLOWED_SITES = 'ab';
+    const BREADTH_FIRST = 'ac';
+    const PAGE_IMPORTANCE = 'ad';
+    const MACHINE_URI = 'ae';
+    const SITE_INFO = 'af';
+    const FILETYPE = 'ag';
+    const SUMMARY = 'ah';
+    const URL_INFO = 'ai';
+    const HASH_SEEN_URLS ='aj';
+    const RECENT_URLS ='ak';
+    const MEMORY_USAGE ='al';
+    const DOC_ID ='am';
+    const RELEVANCE ='an';
+    const PAGE_RULES ='ao';
+    const CACHE_PAGE_PARTITION = 'ap';
+    const GENERATION = 'aq';
+    const HASH_SUM_SCORE = 'ar';
+    const HASH_URL_COUNT = 'as'; //not used
+    const IS_DOC = 'at';
+    // 'av' and 'au' are listed out of alphabetical order here
+    const BOOST = 'av';
+    const IP_ADDRESSES = 'au';
+    const JUST_METAS = 'aw';
+    const WEB_CRAWL = 'ax';
+    const ARCHIVE_CRAWL = 'ay';
+    const CRAWL_TYPE = 'az';
+    const CRAWL_INDEX = 'ba';
+    const HEADER = 'bb';
+    const SERVER = 'bc';
+    const SERVER_VERSION = 'bd';
+    const OPERATING_SYSTEM = 'be';
+    const MODIFIED = 'bf';
+    const LANG = 'bg';
+    const ROBOT_INSTANCE = 'bh';
+    const DOC_LEN = 'bi';
+    const SUBDOCS = 'bj';
+    const SUBDOCTYPE = 'bk';
+    const INDEXING_PLUGINS = 'bl';
+    const DOMAIN_WEIGHTS = 'bm';
+    const POSITION_LIST = 'bn';
+    const PROXIMITY = 'bo';
+    const LOCATION = 'bp';
+    const INDEXED_FILE_TYPES = 'bq';
+    const PAGE_RANGE_REQUEST = 'br';
+    const PAGE_RECRAWL_FREQUENCY = 'bs';
+    const DATA = 'bt';
+    const QUEUE_SERVERS = "bu";
+    const CURRENT_SERVER = "bv";
+    const SIZE = "bw";
+    const TOTAL_TIME = "bx";
+    const DNS_TIME = "by";
+    const AGENT_LIST = "bz";
+    const ROBOT_METAS = "ca";
+    const ARC_DIR = "cb";
+    const ARC_TYPE = "cc";
+    const ARC_DATA = "cd";
+    const KEY = "ce";
+    const MACHINE_ID = 'cf';
+    const VIDEO_SOURCES = 'cg';
+    const IS_FEED = 'ch';
+    const SOURCE_NAME = 'ci';
+    const LINK_SEEN_URLS = 'cj';
+    const POST_MAX_SIZE = 'ck';
+    const LOGGING = 'cl';
+    const META_WORDS = 'cm';
+    const CACHE_PAGES = 'cn';
+    const WARC_ID = 'co';
+    const START_PARTITION = 'cp';
+    const INI = 'cq';
+    const UI_FLAGS = 'cr';
+    const KEYWORD_LINKS = 'cs';
+    const END_ITERATOR = 'ct';
+    const ACTIVE_CLASSIFIERS = 'cu';
+    const ACTIVE_CLASSIFIERS_DATA = 'cv';
+    const MAX_DESCRIPTION_LEN = 'cw';
+    const CACHE_PAGE_VALIDATORS = 'cx';
+    const CACHE_PAGE_VALIDATION_DATA = 'cy';
+    const NUM_PARTITIONS = 'cz';
+    const PARTITION_NUM = 'da';
+    const ACTIVE_RANKERS = 'db';
+    const USER_RANKS = "dc";
+    const INDEXING_PLUGINS_DATA = "dd";
+    const REPOSITORY_TYPE = 'de';
+    const FILE_NAME = 'df';
+    const SHA_HASH = 'dg';
+    const TOR_PROXY = 'dh';
+    const PROXY_SERVERS = 'di';
+    // An integer (2^31 - 1), unlike the string codes around it
+    const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
+    const BASIC_SUMMARIZER = 'dk';
+    const CENTROID_SUMMARIZER = 'dl';
+    const SUMMARIZER_OPTION = 'dm';
+    const WORD_CLOUD = 'dn';
+    const THESAURUS_SCORE ='do';
+    const IS_GOPHER_URL = "dp";
+    const MINIMUM_FETCH_LOOP_TIME = "dq";
+    const IMAGE_LINK = "dr";
+    const GRAPH_BASED_SUMMARIZER = 'ds';
+    const CENTROID_WEIGHTED_SUMMARIZER = 'dt';
+    const SCRAPER_LABEL = 'du';
+    const SCRAPERS = 'dv';
+    const IS_NEWS = "dw";
+    const QUESTION_ANSWERS = 'dx';
+    const CONTENT_SIZE = 'dy';
+    const NO_RANGE = 'dz';
+}
--- a/src/library/CrawlDaemon.php
+++ b/src/library/CrawlDaemon.php
@ -0,0 +1,371 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\CrawlConstants;
+
+/**
+ * Load the crawlLog function
+ */
+require_once C\BASE_DIR."/library/Utility.php";
+/**
+ * Used to run scripts as a daemon on *nix systems
+ *
+ * @author Chris Pollett
+ */
+class CrawlDaemon implements CrawlConstants
+{
+    /**
+     * Name prefix to be used on files associated with this daemon
+     * (such as lock file and messages)
+     * @var string
+     * @static
+     */
+    public static $name;
+    /**
+     * Subname of the name prefix used on files associated with this daemon
+     * For example, the name might be fetcher, the subname might be 2 to indicate
+     * which fetcher daemon instance.
+     *
+     * @var string
+     * @static
+     */
+    public static $subname;
+    /**
+     * Used by processHandler to decide whether run as daemon or not
+     * @var string
+     * @static
+     */
+    public static $mode;
+    /**
+     * Tick callback which refreshes the timestamp stored in this process's
+     * lock file. The process is stopped if the lock file no longer exists,
+     * or if more than PROCESS_TIMEOUT seconds have passed since the previous
+     * processHandler call (unless $continue is set).
+     *
+     * @param bool $continue if true, exceeding PROCESS_TIMEOUT is only
+     *   logged and the process is stopped only if the lock file is missing
+     * @return bool true (returned immediately, without touching the lock
+     *   file, when not running in daemon mode)
+     */
+    public static function processHandler($continue = false)
+    {
+        static $time = 0;
+        if (self::$mode != 'daemon') {
+            return true;
+        }
+        $now = time();
+        if ($time == 0) {
+            // first call, so there is no earlier call to measure from
+            $time = $now;
+        }
+        $elapsed = $now - $time;
+        $timed_out = $elapsed > C\PROCESS_TIMEOUT;
+        $lock_file = self::getLockFileName(self::$name, self::$subname);
+        $lock_missing = !file_exists($lock_file);
+        if ($timed_out || $lock_missing) {
+            $name_string = self::getNameString(self::$name, self::$subname);
+            if ($timed_out) {
+                crawlLog($name_string . ": " . $elapsed .
+                    " seconds has elapsed since processHandler last called.",
+                    null, true);
+                crawlLog("Timeout exceeded...", null, true);
+            }
+            if ($lock_missing || !$continue) {
+                crawlLog("Stopping $name_string ...", null, true);
+                exit();
+            }
+        }
+        // still alive: remember this call and stamp the lock file with it
+        $time = $now;
+        file_put_contents($lock_file, $now);
+        return true;
+    }
+    /**
+     * Handles the command line arguments of a daemon script: either sends a
+     * message to the named daemon (start, stop) or sets the current process
+     * up to run in the foreground (terminal) or as the daemon itself (child).
+     *
+     * @param array $argv an array of command line arguments. $argv[1] is the
+     *     command: start launches the script as a background process by way
+     *     of CrawlDaemon::start; stop removes the daemon's lock file by way
+     *     of CrawlDaemon::stop; terminal runs in the current process with
+     *     LOG_TO_FILES set to false; child is the command a started daemon
+     *     is launched with and sets LOG_TO_FILES to true. Any other command
+     *     exits. $argv[2], if present and not "none", is the subname of the
+     *     daemon instance; any further arguments are passed on as options
+     *     when starting.
+     * @param string $name the prefix to use for lock and message files
+     * @param int $exit_type passed to CrawlDaemon::start as its $exit
+     *     argument when the command is start, where it controls whether the
+     *     lock file is checked, whether it is written and whether exit() is
+     *     called
+     */
+    public static function init($argv, $name, $exit_type = 1)
+    {
+        $has_subname = isset($argv[2]) && $argv[2] != "none";
+        self::$name = $name;
+        self::$subname = ($has_subname) ? $argv[2] : "";
+        //don't let our script be run from apache
+        if (isset($_SERVER['DOCUMENT_ROOT']) &&
+            strlen($_SERVER['DOCUMENT_ROOT']) > 0) {
+            echo "BAD REQUEST";
+            exit();
+        }
+        if (!isset($argv[1])) {
+            echo "$name needs to be run with a command-line argument.\n" .
+                "For example,\n" .
+                "php $name.php start //starts the $name as a daemon\n" .
+                "php $name.php stop //stops the $name daemon\n" .
+                "php $name.php terminal //runs $name within the current " .
+                "process, not as a daemon, output going to the terminal\n";
+            exit();
+        }
+        $messages_file = self::getMesssageFileName(self::$name,
+            self::$subname);
+        switch ($argv[1]) {
+            case "start":
+                $options = "";
+                $num_args = count($argv);
+                for ($i = 3; $i < $num_args; $i++) {
+                    $options .= " " . $argv[$i];
+                }
+                $subname = ($has_subname) ? self::$subname : 'none';
+                $name_prefix = self::$subname;
+                if (isset($argv[3])) {
+                    $name_prefix = $argv[3];
+                }
+                echo "Starting " . self::getNameString($name, $name_prefix) .
+                    "...\n";
+                self::start($name, $subname, $options, $exit_type);
+                break;
+            case "stop":
+                self::stop($name, self::$subname);
+                break;
+            case "terminal":
+                self::$mode = 'terminal';
+                self::writeWaitingStartMessage($messages_file);
+                C\nsdefine("LOG_TO_FILES", false);
+                break;
+            case "child":
+                self::$mode = 'daemon';
+                self::writeWaitingStartMessage($messages_file);
+                // if false log messages are sent to the console
+                C\nsdefine("LOG_TO_FILES", true);
+                break;
+            default:
+                exit();
+        }
+    }
+    /**
+     * Writes a serialized message whose STATUS is
+     * WAITING_START_MESSAGE_STATE to the given messages file and makes that
+     * file readable, writable and executable by everyone
+     *
+     * @param string $messages_file path of the messages file to write
+     */
+    private static function writeWaitingStartMessage($messages_file)
+    {
+        $info = [self::STATUS => self::WAITING_START_MESSAGE_STATE];
+        file_put_contents($messages_file, serialize($info));
+        chmod($messages_file, 0777);
+    }
+    /**
+     * Used to start a daemon running in the background
+     *
+     * @param string $name the main name of this daemon such as queue_server
+     *     or fetcher.
+     * @param string $subname the instance name if it is possible for more
+     *     than one copy of the daemon to be running at the same time. The
+     *     value 'none' is treated as no subname when working out the lock
+     *     file name, but is still passed on to the launched script.
+     * @param string $options a string of additional command line options
+     * @param int $exit controls lock handling and exiting. If
+     *     -3 < $exit < 3 and a lock file younger than PROCESS_TIMEOUT exists
+     *     the daemon is assumed to be running and the script exits without
+     *     starting anything; if $exit is >= 3 or <= -3 the lock is not
+     *     checked. If $exit != 0 a lock file holding the current time is
+     *     written after the daemon is launched (not writing it allows both
+     *     queue server processes (Indexer and Scheduler) to use the same
+     *     lock file). If $exit > 0 this method calls exit() rather than
+     *     returning.
+     */
+    public static function start($name, $subname = "", $options = "", $exit = 1)
+    {
+        $tmp_subname = ($subname == 'none') ? '' : $subname;
+        $lock_file = CrawlDaemon::getLockFileName($name, $tmp_subname);
+        if (file_exists($lock_file) && ($exit < 3 && $exit > -3)) {
+            $time = intval(file_get_contents($lock_file));
+            if (time() - $time < C\PROCESS_TIMEOUT) {
+                echo "$name appears to be already running...\n";
+                echo "Try stopping it first, then running start.";
+                exit();
+            }
+        }
+        $php = "php";
+        if (C\nsdefined("PHP_PATH") ) {
+            $php = C\PHP_PATH."/".$php;
+        }
+        /* make sure hhvm has write access to the folder
+           of the owner of the webserver process so it can write
+           a .hhvm.hhbc file
+         */
+        if (function_exists("posix_getpwuid")) {
+            $process_user_info = posix_getpwuid(posix_getuid());
+            /* posix_getpwuid returns false if the user cannot be looked up,
+               in which case there is no home folder to test
+             */
+            $process_home = (is_array($process_user_info) &&
+                isset($process_user_info['dir'])) ?
+                $process_user_info['dir'] : "";
+            if (C\nsdefined("FORCE_HHVM") || (
+                stristr(phpversion(), "hhvm") !== false &&
+                $process_home != "" &&
+                posix_access($process_home, POSIX_W_OK))) {
+                $php = 'hhvm -f ';
+                if (C\nsdefined("HHVM_PATH") ) {
+                    $php = C\HHVM_PATH."/".$php;
+                }
+            }
+        }
+        if (strstr(PHP_OS, "WIN")) {
+            $base_dir = str_replace("/", "\\", C\BASE_DIR);
+            $script = "start /B $php ".
+                $base_dir."\\executables\\$name.php child %s";
+        } else {
+            $script = "$php '".
+                C\BASE_DIR."/executables/$name.php' child %s < /dev/null ".
+                " > /dev/null &";
+        }
+        $total_options = "$subname $options";
+        $at_job = sprintf($script, $total_options);
+        $handle = popen($at_job, "r");
+        // popen returns false on failure and pclose only accepts a resource
+        if (is_resource($handle)) {
+            pclose($handle);
+        }
+        if ($exit != 0) {
+            file_put_contents($lock_file,  time());
+        }
+        if ($exit > 0) {
+            exit();
+        }
+    }
+    /**
+     * Used to execute a shell command in its own process. On Windows the
+     * command is launched with start /B; elsewhere it is run in the
+     * background with its input and output attached to /dev/null.
+     *
+     * @param string $cmd the command to execute. It is placed in the shell
+     *     command line as is, so the caller is responsible for any escaping
+     */
+    public static function execInOwnProcess($cmd)
+    {
+        if (strstr(PHP_OS, "WIN")) {
+            $job = "start /B $cmd ";
+        } else {
+            $job = "$cmd < /dev/null > /dev/null &";
+        }
+        $handle = popen($job, "r");
+        // popen returns false on failure and pclose only accepts a resource
+        if (is_resource($handle)) {
+            pclose($handle);
+        }
+    }
+    /**
+     * Used to stop a daemon that is running in the background. This is done
+     * by deleting the daemon's lock file, if it exists; when run from the
+     * command line the outcome is also logged.
+     *
+     * @param string $name the main name of this daemon such as queue_server
+     *     or fetcher.
+     * @param string $subname the instance name if it is possible for more
+     *     than one copy of the daemon to be running at the same time
+     * @param bool $exit whether this method should just return (false) or
+     *      call exit() (true)
+     */
+    public static function stop($name, $subname = "", $exit = true)
+    {
+        $lock_file = self::getLockFileName($name, $subname);
+        $name_string = self::getNameString($name, $subname);
+        $had_lock = file_exists($lock_file);
+        if ($had_lock) {
+            unlink($lock_file);
+        }
+        if (php_sapi_name() == 'cli') {
+            // only report when not being called from a web setting
+            if ($had_lock) {
+                crawlLog("Sending stop signal to $name_string...");
+            } else {
+                crawlLog("$name_string does not appear to running...");
+            }
+        }
+        if ($exit) {
+            exit();
+        }
+    }
+    /**
+     * Computes the path of the messages file used to pass messages to a
+     * daemon running in the background. The file lives in the schedules
+     * folder of CRAWL_DIR.
+     *
+     * @param string $name the main name of this daemon such as queue_server
+     *     or fetcher.
+     * @param string $subname the instance name if it is possible for more
+     *     than one copy of the daemon to be running at the same time
+     *
+     * @return string the name of the message file for the daemon with
+     *     the given name and subname
+     */
+    public static function getMesssageFileName($name, $subname = "")
+    {
+        $name_string = self::getNameString($name, $subname);
+        return C\CRAWL_DIR . "/schedules/" . $name_string . "Messages.txt";
+    }
+    /**
+     * Computes the path of the lock file used by a daemon. The file lives
+     * in the schedules folder of CRAWL_DIR.
+     *
+     * @param string $name the main name of this daemon such as queue_server
+     *     or fetcher.
+     * @param string $subname the instance name if it is possible for more
+     *     than one copy of the daemon to be running at the same time
+     *
+     * @return string the name of the lock file for the daemon with
+     *     the given name and subname
+     */
+    public static function getLockFileName($name, $subname = "")
+    {
+        $name_string = self::getNameString($name, $subname);
+        return C\CRAWL_DIR . "/schedules/" . $name_string . "Lock.txt";
+    }
+    /**
+     * Builds the single name used for a given daemon instance: the name
+     * alone when there is no subname, otherwise subname-name
+     *
+     * @param string $name the main name of this daemon such as queue_server
+     *     or fetcher.
+     * @param string $subname the instance name if it is possible for more
+     *     than one copy of the daemon to be running at the same time
+     *
+     * @return string a single name that combines the name and subname
+     */
+    public static function getNameString($name, $subname)
+    {
+        if ($subname == "") {
+            return $name;
+        }
+        return $subname . "-" . $name;
+    }
+    /**
+     * Returns the statuses of the running daemons. A daemon is considered
+     * running if its lock file in the schedules folder of CRAWL_DIR was
+     * modified less than PROCESS_TIMEOUT seconds ago.
+     *
+     * @return array 2d array active_daemons[name][instance] = true. A
+     *     daemon whose lock file name has no subname part is recorded as
+     *     active_daemons[name][-1] = 1
+     */
+    public static function statuses()
+    {
+        $prefix = C\CRAWL_DIR . "/schedules/";
+        $prefix_len = strlen($prefix);
+        $suffix = "Lock.txt";
+        $suffix_len = strlen($suffix);
+        clearstatcache();
+        $time = time();
+        $active_daemons = [];
+        $lock_files = glob("$prefix*$suffix");
+        if (!is_array($lock_files)) {
+            // glob returns false on error; treat that as no lock files
+            return $active_daemons;
+        }
+        foreach ($lock_files as $file) {
+            $modified = filemtime($file);
+            if ($modified === false) {
+                /* the lock file could not be examined (for example it was
+                   removed after glob listed it), so it marks no daemon
+                 */
+                continue;
+            }
+            if ($time - $modified < C\PROCESS_TIMEOUT) {
+                $len = strlen($file) - $suffix_len - $prefix_len;
+                $pre_name = substr($file, $prefix_len, $len);
+                // lock files are named either name or subname-name
+                $pre_name_parts = explode("-", $pre_name);
+                if (count($pre_name_parts) == 1) {
+                    $active_daemons[$pre_name][-1] = 1;
+                } else {
+                    $first = array_shift($pre_name_parts);
+                    $rest = implode("-", $pre_name_parts);
+                    $active_daemons[$rest][$first] = true;
+                }
+            }
+        }
+        return $active_daemons;
+    }
+}
--- a/src/library/FetchGitRepositoryUrls.php
+++ b/src/library/FetchGitRepositoryUrls.php
@ -0,0 +1,622 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Snigdha Rao Parvatneni
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Library of functions used to fetch Git internal urls
+ *
+ * @author Chris Pollett
+ */
+class FetchGitRepositoryUrls implements CrawlConstants
+{
+    /**
+     * Map from url extensions to the repository type each one indicates
+     * @var array
+     */
+    public static $repository_types = ['git' => 'git', 'svn' => 'svn',
+        'cvs' => 'cvs', 'vss' => 'vss', 'mercurial' => 'mercurial',
+        'monotone' => 'monotone', 'bazaar' => 'bazaar', 'darcs' => 'darcs',
+        'arch' => 'arch'];
+    /**
+     * An array used to store all the Git internal urls
+     * @var array
+     */
+    public static $all_git_urls;
+    /**
+     * An indicator to tell no actions to be taken
+     */
+    const INDICATOR_NONE = 'none';
+    /**
+     * An indicator to indicate git repository
+     */
+    const INDICATOR_GIT = 'git';
+    /**
+     * An indicator to tell more git urls need to be fetched
+     */
+    const GIT_URL_CONTINUE = '@@@@';
+    /**
+     * An indicator to tell starting position of Git url to be used
+     */
+    const GIT_BASE_URL_START = 0;
+    /**
+     * An indicator to tell ending position of Git url to be used
+     */
+    const GIT_BASE_URL_END = '###';
+    /**
+     * A fixed component to be used with Git base url to form Git first url
+     */
+    const GIT_URL_EXTENSION = 'info/refs?service=git-upload-pack';
+    /**
+     * A fixed component to be used with Git urls to get next Git urls
+     */
+    const GIT_URL_OBJECT = 'objects/';
+    /**
+     * A fixed indicator used to get last letter of git base url
+     */
+    const GIT_BASE_URL_END_POSITION = -1;
+    /**
+     * Number of characters to read when getting last letter of git base url
+     */
+    const GIT_BASE_END_LETTER = 1;
+    /**
+     * A fixed position used to indicate starting point to fetch next Git url
+     * from the master file
+     */
+    const GIT_NEXT_URL_START = 0;
+    /**
+     * A fixed position used to indicate ending position to fetch next Git url
+     * from the master file
+     */
+    const GIT_NEXT_URL_END = 40;
+    /**
+     * A fixed indicator used to make desired Git folder structure from SHA hash
+     */
+    const GIT_URL_SPLIT = '/';
+    /**
+     * A fixed indicator used to mark starting position of SHA hash of Git
+     * master tree
+     */
+    const GIT_MASTER_TREE_HASH_START = 16;
+    /**
+     * A fixed indicator used to mark ending position of SHA hash of Git
+     * master tree
+     */
+    const GIT_MASTER_TREE_HASH_END = 41;
+    /**
+     * A fixed indicator used to mark starting position of SHA hash used to
+     * indicate Git object folder
+     */
+    const GIT_FOLDER_NAME_START = 0;
+    /**
+     * A fixed indicator used to mark ending position of SHA hash used to
+     * indicate Git object folder
+     */
+    const GIT_FOLDER_NAME_END = 2;
+    /**
+     * A fixed indicator used to mark starting position of SHA hash used to
+     * indicate Git object file
+     */
+    const GIT_FILE_NAME_START = 2;
+    /**
+     * A fixed indicator used to mark ending position of SHA hash used to
+     * indicate Git object file
+     */
+    const GIT_FILE_NAME_END = 38;
+    /**
+     * A fixed indicator used to indicate Git blob object
+     */
+     const GIT_BLOB_OBJECT = "blob";
+    /**
+     * A fixed indicator used to indicate Git tree object
+     */
+     const GIT_TREE_OBJECT = "tree";
+    /**
+     * A cURL time out parameter
+     */
+     const CURL_TIMEOUT = 5;
+    /**
+     * A cURL transfer parameter
+     */
+     const CURL_TRANSFER = 1;
+    /**
+     * Git blob access code starting position
+     */
+     const BLOB_ACCESS_CODE_START = 0;
+    /**
+     * Git blob access code ending position
+     */
+     const BLOB_ACCESS_CODE_END = 6;
+    /**
+     * Git tree access code starting position
+     */
+     const TREE_ACCESS_CODE_START = 0;
+    /**
+     * Git tree access code ending position
+     */
+     const TREE_ACCESS_CODE_END = 5;
+    /**
+     * Git SHA hash binary starting position
+     */
+     const SHA_HASH_BINARY_START = 0;
+    /**
+     * Git SHA hash binary ending position
+     */
+     const SHA_HASH_BINARY_END = 20;
+    /**
+     * A indicator for starting of Git file or folder name
+     */
+     const GIT_NAME_START = 0;
+    /**
+     * A indicator to represent next position after the access code in Git
+     * blob object
+     */
+     const GIT_BLOB_NEXT = 7;
+    /**
+     * A indicator to represent next position after the access code in Git
+     * tree object
+     */
+     const GIT_TREE_NEXT = 6;
+    /**
+     * A string consisting of the single null byte (presumably the separator
+     * Git uses inside its objects; its use is not in this part of the file)
+     */
+     const HEX_NULL_CHARACTER = "\x00";
+    /**
+     * A indicator to represent that a git file is a blob file
+     */
+    const GIT_BLOB_INDICATOR = '100';
+    /**
+     * A indicator to represent that a git file is a tree file
+     */
+    const GIT_TREE_INDICATOR = '400';
+    /**
+     * Looks up the repository type that corresponds to a url extension
+     *
+     * @param string $extension to check
+     * @return string the entry of $repository_types for the extension, or
+     *     INDICATOR_NONE if the extension is not one of its keys
+     */
+    public static function checkForRepository($extension)
+    {
+        return isset(self::$repository_types[$extension]) ?
+            self::$repository_types[$extension] : self::INDICATOR_NONE;
+    }
+    /**
+     * Sets up the seed sites with urls from a git repository (updates
+     * these sites if have already started downloading from repository).
+     *
+     * If $url_to_check does not contain GIT_URL_CONTINUE, the list of
+     * repository urls is fetched afresh and seeds are taken from its start.
+     * Otherwise the numbers which follow each GIT_URL_CONTINUE marker in the
+     * url are added up to give the index in $all_git_urls to resume from.
+     * In both cases at most NUM_MULTI_CURL_PAGES - $counter seeds are added.
+     *
+     * @param string $url_to_check url needs to be processed
+     * @param int $counter to keep track of number of urls processed; it is
+     *      the index in $seeds at which the first new seed is stored
+     * @param array $seeds store sites which are ready to be downloaded
+     * @param string $repository_indicator indicates the type of the
+     *      repository; stored as REPOSITORY_TYPE of the first seed added
+     * @param array $site_pair contains original Git url crawled; the values
+     *      $site_pair['value'][1] and $site_pair['value'][2] are used as
+     *      the WEIGHT and CRAWL_DELAY of each seed added
+     * @param int $total_git_urls number of urls in $all_git_urls (only used
+     *      when resuming)
+     * @param array $all_git_urls current list of urls from git repository
+     *      (only used when resuming), each entry an array of file name,
+     *      SHA hash and url
+     * @return array $git_internal_urls with fields: position, the index of
+     *      the last seed written; index, the number of seeds added by this
+     *      call (0 if none were); seeds, the updated $seeds; indicator,
+     *      INDICATOR_NONE once the last repository url has been used and
+     *      INDICATOR_GIT if more remain; count, the total number of
+     *      repository urls; all, the list of repository urls
+     */
+    public static function setGitRepositoryUrl($url_to_check, $counter, $seeds,
+        $repository_indicator, $site_pair, $total_git_urls, $all_git_urls)
+    {
+        $position = strpos($url_to_check, self::GIT_URL_CONTINUE);
+        if (!$position) {
+            /* no continuation marker (as before, a marker at the very start
+               of the url is treated like a missing one): start afresh
+             */
+            $all_git_urls = self::fetchGitRepositoryUrl($url_to_check);
+            $count_all_git_urls = count($all_git_urls);
+            $git_index = 0;
+        } else {
+            $extension_string = substr($url_to_check, $position);
+            $extension_count = explode(self::GIT_URL_CONTINUE,
+                $extension_string);
+            $git_index = intval(array_sum($extension_count));
+            $count_all_git_urls = $total_git_urls;
+        }
+        // cannot add more seeds than free slots or than urls remaining
+        $free_slots = intval(C\NUM_MULTI_CURL_PAGES) - $counter;
+        $remaining_urls = $count_all_git_urls - $git_index;
+        $num_to_add = ($free_slots < $remaining_urls) ? $free_slots :
+            $remaining_urls;
+        // stays 0 if there is nothing to add, rather than being undefined
+        $git_url_index = 0;
+        for ($j = 0; $j < $num_to_add; $j++) {
+            $git_url_details = $all_git_urls[$git_index];
+            $seeds[$counter][self::URL] = $git_url_details[2];
+            $seeds[$counter][self::WEIGHT] = $site_pair['value'][1];
+            $seeds[$counter][self::CRAWL_DELAY] = $site_pair['value'][2];
+            $seeds[$counter][self::REPOSITORY_TYPE] = $repository_indicator;
+            $seeds[$counter][self::FILE_NAME] = $git_url_details[0];
+            $seeds[$counter][self::SHA_HASH] = $git_url_details[1];
+            $counter++;
+            $git_index++;
+            $git_url_index = $j + 1;
+            $repository_indicator = ($git_index >= $count_all_git_urls) ?
+                self::INDICATOR_NONE : self::INDICATOR_GIT;
+        }
+        // make $counter the index of the last seed written
+        $counter--;
+        $git_internal_urls = [];
+        $git_internal_urls['position'] = $counter;
+        $git_internal_urls['index'] = $git_url_index;
+        $git_internal_urls['seeds'] = $seeds;
+        $git_internal_urls['indicator'] = $repository_indicator;
+        $git_internal_urls['count'] = $count_all_git_urls;
+        $git_internal_urls['all'] = $all_git_urls;
+        return $git_internal_urls;
+    }
+    /**
+     * Get the Git internal urls from the parent Git url. The base url is
+     * the part of $url_to_check before GIT_BASE_URL_END, with a trailing
+     * GIT_URL_SPLIT added if missing. Three requests are then chained: the
+     * base url followed by GIT_URL_EXTENSION (read without decompression),
+     * the url computed from it by getGitMasterFile, and the url computed
+     * from that by getGitMasterTree (both read with decompression). The
+     * content of the last is handed to getObjects.
+     *
+     * @param string $url_to_check url needs to be processed
+     * @return array $git_next_urls consists of list of Git
+     *      internal urls which are called during the git clone
+     */
+    public static function fetchGitRepositoryUrl($url_to_check)
+    {
+        $marker_position = strpos($url_to_check, self::GIT_BASE_URL_END);
+        $git_base_url = substr($url_to_check, self::GIT_BASE_URL_START,
+            $marker_position);
+        $last_letter = substr($git_base_url,
+            self::GIT_BASE_URL_END_POSITION, self::GIT_BASE_END_LETTER);
+        if ($last_letter != self::GIT_URL_SPLIT) {
+            $git_base_url .= self::GIT_URL_SPLIT;
+        }
+        $first_content = self::getNextGitUrl(
+            $git_base_url . self::GIT_URL_EXTENSION, false);
+        $master_file_url = self::getGitMasterFile($first_content,
+            $git_base_url);
+        $master_file_content = self::getNextGitUrl($master_file_url, true);
+        $master_tree_url = self::getGitMasterTree($master_file_content,
+            $git_base_url);
+        $master_tree_content = self::getNextGitUrl($master_tree_url, true);
+        return self::getObjects($master_tree_content, $git_base_url);
+    }
+    /**
+     * Get the Git second url which points to Git master tree structure.
+     * The first GIT_NEXT_URL_END characters of the content are taken as a
+     * hash; its first two characters name the folder and the characters
+     * after them the file beneath GIT_URL_OBJECT of the base url.
+     *
+     * @param string $git_first_url_content contents of Git first url
+     * @param string $git_base_url common portion of Git urls
+     * @return string $git_next_url consists of second internal Git url
+     */
+    public static function getGitMasterFile($git_first_url_content,
+        $git_base_url)
+    {
+        $hash = substr($git_first_url_content, self::GIT_NEXT_URL_START,
+            self::GIT_NEXT_URL_END);
+        $folder_name = substr($hash, self::GIT_FOLDER_NAME_START,
+            self::GIT_FOLDER_NAME_END);
+        $file_name = substr($hash, self::GIT_FILE_NAME_START,
+            self::GIT_FILE_NAME_END);
+        return $git_base_url . self::GIT_URL_OBJECT . $folder_name .
+            self::GIT_URL_SPLIT . $file_name;
+    }
+    /**
+     * Get the Git third url which contains the information about the
+     *    organization of entire git repository
+     *
+     * The hash of the master tree is read from just after the first
+     * occurrence of a null byte followed by "tree " in the content. An
+     * uncompressed Git commit object starts with "commit <size>", a null
+     * byte, and then "tree <hash>", so this works whatever the number of
+     * digits in <size>. If that marker is not found, the fixed offset
+     * GIT_MASTER_TREE_HASH_START (which matches a three digit size) is used.
+     *
+     * @param string $git_second_url_content contents of Git second url
+     * @param string $git_base_url common portion of git urls
+     * @return string $git_next_url consists of third internal git url
+     */
+    public static function getGitMasterTree($git_second_url_content,
+        $git_base_url)
+    {
+        $tree_marker = self::HEX_NULL_CHARACTER . "tree ";
+        $marker_position = strpos($git_second_url_content, $tree_marker);
+        $hash_start = ($marker_position === false) ?
+            self::GIT_MASTER_TREE_HASH_START :
+            $marker_position + strlen($tree_marker);
+        $git_master_tree_hash = substr($git_second_url_content,
+            $hash_start, self::GIT_MASTER_TREE_HASH_END);
+        $git_object_folder_name = substr($git_master_tree_hash,
+            self::GIT_FOLDER_NAME_START, self::GIT_FOLDER_NAME_END);
+        $git_object_file_name = substr($git_master_tree_hash,
+            self::GIT_FILE_NAME_START, self::GIT_FILE_NAME_END);
+        $git_object_path = $git_object_folder_name . self::GIT_URL_SPLIT .
+            $git_object_file_name;
+        $git_next_url = $git_base_url . self::GIT_URL_OBJECT . $git_object_path;
+        return $git_next_url;
+    }
+    /**
+     * Get the Git content from url which will be used to get the
+     *    next git url
+     *
+     * @param string $git_url git url to extract contents from it
+     * @param bool $compression_indicator if true the downloaded data is
+     *    passed through gzuncompress before being returned, if false it is
+     *    returned as downloaded
+     * @return string $git_object_content consists contents extracted from the
+     * url
+     */
+    public static function getNextGitUrl($git_url, $compression_indicator)
+    {
+        $git_object_content = self::getGitdata($git_url);
+        if ($compression_indicator) {
+            $git_object_content = gzuncompress($git_object_content);
+        }
+        return $git_object_content;
+    }
+    /**
+     * Get the Git blob and tree objects by repeatedly reading entries out
+     * of the content of a git tree object
+     *
+     * @param string $git_object_content content of git master tree file
+     *    (fetchGitRepositoryUrl passes the already uncompressed content)
+     * @param string $git_base_url common content of git url
+     * @return array $blob_url contains information and url for git blob
+     *    objects; each entry is an array whose element 0 is a name, element
+     *    1 a hash and element 2 a url
+     */
+    public static function getObjects($git_object_content, $git_base_url)
+    {
+        $blob_url = [];
+        $temp_git_object_content['content'] = $git_object_content;
+        /* $i is recomputed on every pass (see below) as the number of bytes
+           of the original content which are no longer in
+           $temp_git_object_content['content']
+         */
+        for ($i = 0; $i < strlen($git_object_content); $i++) {
+            // look for the next blob and tree indicators in what is left
+            $blob_position = strpos($temp_git_object_content['content'],
+                self::GIT_BLOB_INDICATOR);
+            $tree_position = strpos($temp_git_object_content['content'],
+                self::GIT_TREE_INDICATOR);
+            /* NOTE(review): checkPosition is given the full original
+               content, not the remaining content -- confirm this is intended
+             */
+            $git_object_positions = self::checkPosition($blob_position,
+                $tree_position, $git_object_content);
+            $blob_position = $git_object_positions['blob'];
+            $tree_position = $git_object_positions['tree'];
+            /* handle whichever indicator comes first. NOTE(review): if the
+               two positions are equal neither reader is called and the
+               content is unchanged; whether that can happen depends on
+               checkPosition
+             */
+            if ($blob_position < $tree_position) {
+                $temp_git_object_content = self::readBlobSha(
+                    $temp_git_object_content['content'], $blob_position,
+                        strlen($temp_git_object_content['content']),
+                            $git_base_url);
+            }
+            else if ($tree_position < $blob_position) {
+                $temp_git_object_content = self::readTreeSha(
+                    $temp_git_object_content['content'], $tree_position,
+                        strlen($temp_git_object_content['content']),
+                            $git_base_url);
+            }
+            // bytes consumed so far = original length - remaining length
+            $i = strlen($temp_git_object_content['content']);
+            $i = strlen($git_object_content) - $i;
+            /* presumably ['value'] describes the entry just read and is a
+               blob unless its indicator is GIT_TREE_OBJECT -- verify against
+               readBlobSha and readTreeSha
+             */
+            if ($temp_git_object_content['value']['indicator'] !=
+                self::GIT_TREE_OBJECT) {
+                $blob_details[0] = $temp_git_object_content['value']['name'];
+                $blob_details[1] = $temp_git_object_content['value']['hash'];
+                $blob_details[2] = $temp_git_object_content['value']['url'];
+                $blob_url[] = $blob_details;
+            }
+            /* when ['indicator'] is not GIT_BLOB_OBJECT it is treated as a
+               list of entries, each of which is appended to the result
+             */
+            if ($temp_git_object_content['indicator'] != self::GIT_BLOB_OBJECT){
+                for ($k = 0; $k < count($temp_git_object_content['indicator']);
+                    $k++) {
+                    /* NOTE(review): $blob_details is filled in here but is
+                       not what gets appended below
+                     */
+                    $blob_details[0] = $temp_git_object_content['indicator'][$k]
+                        [0];
+                    $blob_details[1] = $temp_git_object_content['indicator'][$k]
+                        [1];
+                    $blob_details[2] = $temp_git_object_content['indicator'][$k]
+                        [2];
+                    $blob_url[] = $temp_git_object_content['indicator'][$k];
+                }
+            }
+        }
+        return $blob_url;
+    }
+    /**
+     * checks the position of access code for null values
+     *
+     * @param mixed $git_blob_position first occurrence of git blob access
+     *    code, or false when strpos did not find one
+     * @param mixed $git_tree_position first occurrence of git tree access
+     *    code, or false when strpos did not find one
+     * @param string $git_object_content compressed content of git master tree
+     * @return array $git_object_positions the 'blob' and 'tree' positions,
+     *    with any boolean position replaced by the length of the content
+     */
+    public static function checkPosition($git_blob_position, $git_tree_position,
+        $git_object_content)
+    {
+        /* a boolean position means the access code was not found; it is
+           swapped for the length of the content so both entries are numbers
+         */
+        return [
+            'blob' => is_bool($git_blob_position) ?
+                strlen($git_object_content) : $git_blob_position,
+            'tree' => is_bool($git_tree_position) ?
+                strlen($git_object_content) : $git_tree_position
+        ];
+    }
+    /**
+     * Get the details of the blob file i.e blob file name, sha hash and content
+     *
+     * @param string $git_object_content compressed content of git master tree
+     * @param string $blob_position first occurrence of git blob access code
+     *    in $git_object_content
+     * @param string $length length of the compressed content of git master tree
+     * @param string $git_base_url common portion of git url
+     * @return array $git_blob_content contains details of git blob object
+     */
+    public static function readBlobSha($git_object_content, $blob_position,
+        $length, $git_base_url)
+    {
+        // every cut below uses $length as its maximum size
+        $remaining = substr($git_object_content, $blob_position, $length);
+        $code = substr($remaining, self::BLOB_ACCESS_CODE_START,
+            self::BLOB_ACCESS_CODE_END);
+        $remaining = substr($remaining, self::GIT_BLOB_NEXT, $length);
+        // offset of the first HEX_NULL_CHARACTER, used as the name length
+        $name_end = strpos($remaining, self::HEX_NULL_CHARACTER);
+        $name = substr($remaining, self::GIT_NAME_START, $name_end);
+        $remaining = substr($remaining, $name_end + 1, $length);
+        // the binary sha is stored as hex text
+        $hash = bin2hex(substr($remaining, self::SHA_HASH_BINARY_START,
+            self::SHA_HASH_BINARY_END));
+        // what follows the hash is handed back so the caller can continue
+        $remaining = substr($remaining, self::SHA_HASH_BINARY_END, $length);
+        return [
+            'value' => [
+                'code' => $code,
+                'name' => $name,
+                'hash' => $hash,
+                'url' => self::urlMaker($hash, $git_base_url),
+                'indicator' => self::GIT_BLOB_OBJECT
+            ],
+            'content' => $remaining,
+            'indicator' => self::GIT_BLOB_OBJECT
+        ];
+    }
+    /**
+     * Get the details of the tree file i.e folder name, sha hash and
+     * blob url inside the tree
+     *
+     * @param string $git_object_content compressed content of git master tree
+     * @param string $tree_position first occurrence of git tree access code in
+     *    $git_object_content
+     * @param string $length length of the compressed content of git master tree
+     * @param string $git_base_url common portion of git url
+     * @return array $git_tree_content contains details of git tree object
+     */
+    public static function readTreeSha($git_object_content, $tree_position,
+        $length, $git_base_url)
+    {
+        // every cut below uses $length as its maximum size
+        $remaining = substr($git_object_content, $tree_position, $length);
+        $code = substr($remaining, self::TREE_ACCESS_CODE_START,
+            self::TREE_ACCESS_CODE_END);
+        $remaining = substr($remaining, self::GIT_TREE_NEXT, $length);
+        // offset of the first HEX_NULL_CHARACTER, used as the name length
+        $name_end = strpos($remaining, self::HEX_NULL_CHARACTER);
+        $name = substr($remaining, self::GIT_NAME_START, $name_end);
+        $remaining = substr($remaining, $name_end + 1, $length);
+        // the binary sha is stored as hex text
+        $hash = bin2hex(substr($remaining, self::SHA_HASH_BINARY_START,
+            self::SHA_HASH_BINARY_END));
+        // what follows the hash is handed back so the caller can continue
+        $remaining = substr($remaining, self::SHA_HASH_BINARY_END, $length);
+        return [
+            'value' => [
+                'code' => $code,
+                'name' => $name,
+                'hash' => $hash,
+                'indicator' => self::GIT_TREE_OBJECT
+            ],
+            'content' => $remaining,
+            // for a tree this slot carries the blobs found inside the tree
+            'indicator' => self::checkNestedStructure($hash, $git_base_url)
+        ];
+    }
+    /**
+     * Checks the nested structure inside git tree object
+     *
+     * @param string $sha_hash sha of the git tree object
+     * @param string $git_base_url common portion of the parent git url
+     * @return string $blob_url contains url of the blob file inside the folder
+     */
+    public static function checkNestedStructure($sha_hash, $git_base_url)
+    {
+        // download the object stored under this hash and inflate it
+        $tree_content = gzuncompress(
+            self::getGitData(self::urlMaker($sha_hash, $git_base_url)));
+        // getObjects calls back into this method for any tree it meets
+        return self::getObjects($tree_content, $git_base_url);
+    }
+    /**
+     * Makes the git clone internal url for blob objects
+     *
+     * @param string $sha_hash of the git blob object
+     * @param string $git_base_url common portion of git url
+     * @return string $git_object_url contains the complete url of the blob file
+     */
+    public static function urlMaker($sha_hash, $git_base_url)
+    {
+        // one slice of the hash names the folder, another names the file
+        $folder = substr($sha_hash, self::GIT_FOLDER_NAME_START,
+            self::GIT_FOLDER_NAME_END);
+        $file = substr($sha_hash, self::GIT_FILE_NAME_START,
+            self::GIT_FILE_NAME_END);
+        return $git_base_url . self::GIT_URL_OBJECT . $folder .
+            self::GIT_URL_SPLIT . $file;
+    }
+    /**
+     * Makes the cURL call to get the contents
+     *
+     * @param string $git_url url to download the contents
+     * @return string $git_content actual content of the git url
+     */
+    public static function getGitData($git_url)
+    {
+        $curl_handle = curl_init();
+        curl_setopt($curl_handle, CURLOPT_URL, $git_url);
+        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER,
+            self::CURL_TRANSFER);
+        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, self::CURL_TIMEOUT);
+        // result of the transfer is handed back untouched
+        $response = curl_exec($curl_handle);
+        curl_close($curl_handle);
+        return $response;
+    }
+}
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@ -0,0 +1,803 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\UrlParser;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ *
+ * Code used to manage HTTP or Gopher requests from one or more URLS
+ *
+ * @author Chris Pollett
+ */
+class FetchUrl implements CrawlConstants
+{
+    /**
+     * Make multi_curl requests for an array of sites with urls or onion urls
+     *
+     * @param array $sites  an array containing urls of pages to request
+     * @param bool $timer  flag, true means print timing statistics to log
+     * @param int $page_range_request maximum number of bytes to download/page
+     *     0 means download all
+     * @param string $temp_dir folder to store temporary ip header info
+     * @param string $key  the component of $sites[$i] that has the value of
+     *     a url to get defaults to URL
+     * @param string $value component of $sites[$i] in which to store the
+     *     page that was gotten
+     * @param bool $minimal if true do a faster request of pages by not
+     *     doing things like extract HTTP headers sent, etc.
+     * @param array $post_data data to be POST'd to each site
+     * @param bool $follow whether to follow redirects or not
+     * @param string $tor_proxy url of a proxy that knows how to download
+     *     .onion urls
+     * @param array $proxy_servers if not [], then an array of proxy
+     *     server to use rather than to directly download web pages from
+     *     the current machine
+     *
+     * @return array an updated array with the contents of those pages
+     */
+    public static function getPages($sites, $timer = false,
+        $page_range_request = C\PAGE_RANGE_REQUEST, $temp_dir = null,
+        $key=CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal=false,
+        $post_data = null, $follow = false, $tor_proxy = "",
+        $proxy_servers=[])
+    {
+        static $agent_handler = null;
+        static $handler_time = 0;
+        if (empty($agent_handler)) {
+            /* try to keep handler around between calls to allow for connection
+                reuse
+             */
+            $agent_handler = curl_multi_init();
+            $handler_time = microtime(true);
+        }
+        $start_time = microtime(true);
+        if (!$minimal && $temp_dir == null) {
+            $temp_dir = C\CRAWL_DIR."/temp";
+            if (!file_exists($temp_dir)) {
+                mkdir($temp_dir);
+            }
+        }
+        //Set-up requests
+        $num_sites = count($sites);
+        for ($i = 0; $i < $num_sites; $i++) {
+            $is_gopher = false;
+            $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
+            if (isset($sites[$i][$key])) {
+                list($sites[$i][$key], $url, $headers) =
+                    self::prepareUrlHeaders($sites[$i][$key], $minimal,
+                    $proxy_servers);
+                // prepareUrlHeaders signals a gopher url via its headers slot
+                if ($headers == "gopher") {
+                    $is_gopher = true;
+                    $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
+                    $headers = [];
+                }
+                $sites[$i][0] = curl_init();
+                if (!$minimal) {
+                    /* curl's verbose output is sent to a temp file so the
+                       ip addresses can be read back from it later
+                     */
+                    $ip_holder[$i] = fopen("$temp_dir/tmp$i.txt", 'w+');
+                    curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
+                    curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
+                }
+                curl_setopt($sites[$i][0], CURLOPT_USERAGENT, C\USER_AGENT);
+                curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
+                    CURL_IPRESOLVE_WHATEVER);
+                curl_setopt($sites[$i][0], CURLOPT_URL, $url);
+                /* The redirect setting is decided per request. Previously
+                   $follow itself was switched on for a robots.txt url, which
+                   silently turned on redirect following for every later site
+                   in the same call as well
+                 */
+                $follow_redirects = $follow;
+                if (strcmp(substr($url,-10), "robots.txt") == 0 ) {
+                    $sites[$i]['ROBOT'] = true;
+                    /*wikipedia redirects their robot page. grr
+                      want to force this for robots pages
+                    */
+                    $follow_redirects = true;
+                }
+                curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION,
+                    $follow_redirects);
+                curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
+                curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYPEER, false);
+                curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
+                curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
+                curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,
+                    C\PAGE_TIMEOUT);
+                curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, C\PAGE_TIMEOUT);
+                if (stripos($url,'.onion') !== false && $tor_proxy != "") {
+                    curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
+                    //CURLPROXY_SOCKS5_HOSTNAME = 7
+                    curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
+                    if ($timer) {
+                        crawlLog("Using Tor proxy for $url..");
+                    }
+                } else if ($proxy_servers != [] && !$is_gopher) {
+                    // proxy entries have the form ip[:port[:type]]
+                    $select_proxy = rand(0, count($proxy_servers) - 1);
+                    $proxy_server = $proxy_servers[$select_proxy];
+                    $proxy_parts = explode(":", $proxy_server);
+                    $proxy_ip = $proxy_parts[0];
+                    if (!isset($proxy_parts[2]) ||
+                        strtolower($proxy_parts[2]) == 'http') {
+                        $proxy_type = CURLPROXY_HTTP;
+                    } else if (strtolower($proxy_parts[2]) == 'socks5') {
+                        $proxy_type = CURLPROXY_SOCKS5;
+                    } else {
+                        $proxy_type = $proxy_parts[2];
+                    }
+                    if (isset($proxy_parts[1])) {
+                        $proxy_port = $proxy_parts[1];
+                    } else {
+                        $proxy_port = "80";
+                    }
+                    curl_setopt($sites[$i][0], CURLOPT_PROXY,
+                        "$proxy_ip:$proxy_port");
+                    curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE,
+                        $proxy_type);
+                    if ($timer) {
+                        crawlLog("Selecting proxy $select_proxy for $url");
+                    }
+                }
+                if (!$minimal) {
+                    curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
+                }
+                //make lighttpd happier
+                if (!$is_gopher) {
+                    curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER,
+                        $headers);
+                }
+                curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
+                   // ^ need to set for sites like att that use gzip
+                if ($page_range_request > 0 && empty(
+                    $sites[$i][CrawlConstants::NO_RANGE])) {
+                    curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-".
+                        $page_range_request);
+                } else if (!empty( $sites[$i][CrawlConstants::NO_RANGE])) {
+                    crawlLog("No range used for $url");
+                }
+                if ($post_data != null) {
+                    curl_setopt($sites[$i][0], CURLOPT_POST, true);
+                    curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
+                        $post_data[$i]);
+                }
+                curl_multi_add_handle($agent_handler, $sites[$i][0]);
+            }
+        }
+        if ($timer) {
+            crawlLog("  Init Get Pages ".(changeInMicrotime($start_time)));
+        }
+        $start_time = microtime(true);
+        $start = time();
+        //Wait for responses
+        $running = null;
+        $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
+        $mrc_check = CURLM_CALL_MULTI_PERFORM;
+        restore_error_handler();
+        do {
+            $mrc = @curl_multi_exec($agent_handler, $running);
+            /* 0.05 is to prevent this from being too busy a loop sucking
+               up CPU cycles. We check every 0.05 if another page is ready or
+               not*/
+            if ($mrc != CURLM_CALL_MULTI_PERFORM) {
+                $mrc_check = CURLM_OK;
+                curl_multi_select($agent_handler, 0.05);
+            }
+        } while (memory_get_usage() < $memory_limit && $mrc == $mrc_check &&
+            time() - $start < C\PAGE_TIMEOUT &&  $running > 0);
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
+        if (time() - $start > C\PAGE_TIMEOUT && $timer) {
+            crawlLog("  TIMED OUT!!!");
+        }
+        if ($timer) {
+            crawlLog("  Page Request time ".(changeInMicrotime($start_time)));
+        }
+        $start_time = microtime(true);
+        //Process returned pages
+        for ($i = 0; $i < $num_sites; $i++) {
+            if ($timer) {
+                crawlTimeoutLog("FetchUrl initial processing of page %s of %s",
+                    $i, $num_sites);
+            }
+            if (!$minimal && isset($ip_holder[$i]) ) {
+                // read back curl's verbose log for this request
+                rewind($ip_holder[$i]);
+                $header = fread($ip_holder[$i], 8192);
+                $ip_addresses = self::getCurlIp($header);
+                fclose($ip_holder[$i]);
+            }
+            $is_gopher = false;
+            if (!empty($sites[$i][0])) {
+                // Get Data and Message Code
+                $content = @curl_multi_getcontent($sites[$i][0]);
+                $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
+                /*
+                    If the Transfer-encoding was chunked then the Range header
+                    we sent was ignored. So we manually truncate the data
+                    here
+                 */
+                if ($page_range_request > 0) {
+                    $init_len = strlen($content);
+                    $content = substr($content, 0, $page_range_request);
+                    if (strlen($content) != $init_len) {
+                        $sites[$i][CrawlConstants::CONTENT_SIZE] = $init_len;
+                    }
+                }
+                if (isset($content) && !$minimal && !$is_gopher) {
+                    $site = self::parseHeaderPage($content, $value);
+                    $sites[$i] = array_merge($sites[$i], $site);
+                    if (isset($header)) {
+                        $header = substr($header, 0,
+                            strpos($header, "\x0D\x0A\x0D\x0A") + 4);
+                    } else {
+                        $header = "";
+                    }
+                    $sites[$i][CrawlConstants::HEADER] =
+                        $header . $sites[$i][CrawlConstants::HEADER];
+                    unset($header);
+                } else if (isset($content) && !$minimal && $is_gopher) {
+                    $sites[$i][CrawlConstants::HEADER] =
+                        $header;
+                    $sites[$i][$value] = $content;
+                    unset($header);
+                } else {
+                    $sites[$i][$value] = $content;
+                }
+                if (!$minimal) {
+                    $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
+                        CURLINFO_SIZE_DOWNLOAD);
+                    $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
+                        CURLINFO_NAMELOOKUP_TIME);
+                    $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
+                        CURLINFO_TOTAL_TIME);
+                    $sites[$i][self::HTTP_CODE] =
+                        curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
+                    if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
+                        $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
+                    } else if ($is_gopher) {
+                        $sites[$i][self::HTTP_CODE] = 200;
+                    }
+                    if ($ip_addresses) {
+                        $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
+                    } else {
+                        $sites[$i][self::IP_ADDRESSES] = ["0.0.0.0"];
+                    }
+                    //Get Time, Mime type and Character encoding
+                    $sites[$i][self::TIMESTAMP] = time();
+                    if ($is_gopher) {
+                        $path = UrlParser::getPath($sites[$i][self::URL]);
+                        $filename =
+                            UrlParser::getDocumentFilename(
+                                $sites[$i][self::URL]);
+                        // second character of the path selects the type
+                        if (isset($path[1])) {
+                            $gopher_type = $path[1];
+                        } else {
+                            $gopher_type = 1;
+                        }
+                        if ($gopher_type == 1) {
+                            $sites[$i][self::TYPE] = "text/gopher";
+                        } else if (in_array($gopher_type,
+                            [0, 3, 6])) {
+                            $sites[$i][self::TYPE] = "text/plain";
+                            if ($gopher_type == 6) {
+                                $sites[$i][$value] = convert_uudecode(
+                                    $content);
+                            }
+                        } else if ($gopher_type == 'h') {
+                            $sites[$i][self::TYPE] = "text/html";
+                        } else if ($gopher_type == 'g') {
+                            $sites[$i][self::TYPE] = "image/gif";
+                        }
+                        $path_info = pathinfo($filename);
+                        if (!isset($sites[$i][self::TYPE]) &&
+                            isset($path_info['extension'])) {
+                            $sites[$i][self::TYPE] =
+                                UrlParser::guessMimeTypeFromFileName($filename);
+                        } else if (!isset($sites[$i][self::TYPE])) {
+                            $sites[$i][self::TYPE] = "unknown";
+                        }
+                    } else {
+                        $type_parts =
+                            explode(";", curl_getinfo($sites[$i][0],
+                                CURLINFO_CONTENT_TYPE));
+                        $sites[$i][self::TYPE] =
+                            strtolower(trim($type_parts[0]));
+                    }
+                }
+                 /*
+                   Ideally should have line for all requests
+                   However, this seems to cause curl to sometimes crash
+                   by trying to free stuff twice on some linux systems
+                   at crawl time. Not having it on other systems causes crashes
+                   at query time
+                */
+                if ($minimal || !stristr(PHP_OS, "LINUX")) {
+                    curl_multi_remove_handle($agent_handler, $sites[$i][0]);
+                }
+                curl_close($sites[$i][0]);
+                if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
+                    /* A robots.txt request that was redirected and did not
+                       come back as text/plain is replaced by a synthetic
+                       robots.txt that disallows everything. The check uses
+                       this site's own redirect list rather than the local
+                       $site, which can be stale or undefined here
+                     */
+                    if (isset($sites[$i][self::TYPE]) &&
+                        $sites[$i][self::TYPE] != "text/plain" &&
+                        isset($sites[$i][CrawlConstants::LOCATION]) &&
+                        count($sites[$i][CrawlConstants::LOCATION]) > 0) {
+                        $sites[$i][self::TYPE] = "text/plain";
+                        $tmp = wordwrap($sites[$i][$value], 80);
+                        $tmp_parts = explode("\n", $tmp);
+                        $tmp = "# Suspect server misconfiguration\n";
+                        $tmp .= "# Assume shouldn't crawl this site.\n";
+                        $tmp .= "# Pretending got following robots.txt.\n";
+                        $tmp .= "User-agent: *\n";
+                        $tmp .= "Disallow: /\n";
+                        /* HTTP_CODE still holds the code actually received;
+                           it is only overwritten after this line is built
+                         */
+                        $tmp .= "# Original error code: ".
+                            $sites[$i][self::HTTP_CODE]."\n";
+                        $tmp .= "# Original content:\n";
+                        // append (not overwrite) so the rules above survive
+                        foreach ($tmp_parts as $part) {
+                            $tmp .= "#".$part."\n";
+                        }
+                        $sites[$i][$value] = $tmp;
+                        $sites[$i][self::HTTP_CODE] = "200";
+                        /* NOTE(review): this clears the entry on the local
+                           $site only, not on $sites[$i]; confirm whether the
+                           returned site should drop its redirect list too
+                         */
+                        unset($site[CrawlConstants::LOCATION]);
+                    }
+                }
+            } //end big if
+        } //end for
+        if ($timer) {
+            crawlLog("  Get Page Content time ".
+                (changeInMicrotime($start_time)));
+        }
+        // retire the shared multi handle once it is older than PAGE_TIMEOUT
+        if (microtime(true) - $handler_time > C\PAGE_TIMEOUT) {
+            if (!empty($agent_handler)) {
+                curl_multi_close($agent_handler);
+            }
+            $agent_handler = null;
+        }
+        return $sites;
+    }
+    /**
+     * Curl requests are typically done using a cached ip address which is
+     * stored after ### at the end of a url when one is available. To make
+     * this work, the host in the url is replaced by that ip address and an
+     * http Host: header with the original host name is added to the headers
+     * for the curl request. The job of this function is to do this replacement
+     *
+     * @param string $url site to download with ip address at end potentially
+     *  after ###
+     * @param bool $minimal don't try to do replacement, but do add an Expect
+     *     header
+     * @param array $proxy_servers if not empty an array of proxy servers
+     *     used to crawl through
+     * @return array 3-tuple (orig url, url with replacement, http header array)
+     */
+    public static function prepareUrlHeaders($url, $minimal = false,
+        $proxy_servers = [])
+    {
+        $url = str_replace("&amp;", "&", $url);
+        $is_gopher = false;
+        if (substr($url, 0, 6) == "gopher") {
+            $is_gopher = true;
+        }
+        /*Check if an ETag was added by the queue server. If found, create
+          If-None_Match header with the ETag and add it to the headers. Remove
+          ETag from URL
+         */
+        $if_none_match = "If-None-Match";
+        $etag = null;
+        if (C\USE_ETAG_EXPIRES && stristr($url, "ETag:")) {
+            $etag_parts = preg_split("/ETag\:/i", $url);
+            $etag_data = explode(" ", $etag_parts[1]);
+            // no value after the marker means there is no etag to send
+            $etag = isset($etag_data[1]) ? $etag_data[1] : null;
+            /* the marker is detected case-insensitively above, so it has to
+               be located the same way before it is cut out of the url
+             */
+            $pos = strripos($url, "ETag:");
+            $url = substr_replace($url, "", $pos, strlen("ETag: ".$etag));
+        }
+        /* in queue_server we added the ip (if available)
+          after the url followed by ###
+         */
+        $headers = [];
+        if (!$minimal) {
+            $url_ip_parts = explode("###", $url);
+            // with a proxy or an onion url the cached ip is not used
+            if ($proxy_servers != [] || (isset($url_ip_parts[0]) &&
+                (stripos($url_ip_parts[0],'.onion') !== false)) ) {
+                $url_ip_parts = [$url_ip_parts[0]];
+                $url = $url_ip_parts[0];
+            }
+            if (count($url_ip_parts) > 1) {
+                $ip_address = ltrim(urldecode(array_pop($url_ip_parts)), "#");
+                // packed length tells ipv4 (4) from ipv6 (16); else invalid
+                $len = strlen(inet_pton($ip_address));
+                if ($len == 4 || $len == 16) {
+                    if ($len == 16) {
+                      $ip_address= "[$ip_address]";
+                    }
+                    if (count($url_ip_parts) > 1) {
+                        $url = implode("###", $url_ip_parts);
+                    } else {
+                        $url = $url_ip_parts[0];
+                    }
+                    $url_parts = @parse_url($url);
+                    /* Fall back to the plain url. This also covers a url
+                       for which parse_url finds no host, which previously
+                       left the variable undefined
+                     */
+                    $url_with_ip_if_possible = $url;
+                    if (isset($url_parts['host'])) {
+                        /* the fourth argument of str_replace receives the
+                           number of replacements made; the ip is only used
+                           when the host occurred exactly once in the url
+                         */
+                        $cnt = 1;
+                        $url_with_ip =
+                            str_replace($url_parts['host'], $ip_address ,$url,
+                                 $cnt);
+                        if ($cnt == 1) {
+                            $url_with_ip_if_possible = $url_with_ip;
+                            $headers[] = "Host:".$url_parts['host'];
+                        }
+                    }
+                } else {
+                    $url_with_ip_if_possible = $url;
+                }
+            } else {
+                $url_with_ip_if_possible = $url;
+            }
+        } else {
+            $url_with_ip_if_possible = $url;
+        }
+        $headers[] = 'Expect:';
+        if (C\USE_ETAG_EXPIRES && $etag !== null) {
+            $etag_header = $if_none_match.": ".$etag;
+            $headers[] = $etag_header;
+        }
+        // for gopher urls the headers slot carries the marker string instead
+        if ($is_gopher) {
+            $headers = "gopher";
+        }
+        $results = [$url, $url_with_ip_if_possible, $headers];
+        return $results;
+    }
+    /**
+     * Computes a hash of a string containing page data for use in
+     * deduplication of pages with similar content
+     *
+     * @param string& $page reference to web page data
+     * @return string 8 byte hash to identify page contents
+     */
+    public static function computePageHash(&$page)
+    {
+        // drop script, noscript and style elements together with their bodies
+        $without_elements = preg_replace([
+            '@<script[^>]*?>.*?</script>@si',
+            '@<noscript[^>]*?>.*?</noscript>@si',
+            '@<style[^>]*?>.*?</style>@si'], '', $page);
+        $non_word = '/\W+/';
+        /* NOTE(review): non-word characters, angle brackets included, are
+           removed before strip_tags runs, so strip_tags has little left to
+           strip. Confirm whether that is intended; changing the order would
+           change the hashes produced
+         */
+        $compacted = preg_replace($non_word, '', $without_elements);
+        $hash_input = strip_tags($compacted);
+        if ($hash_input == "") {
+            $hash_input = $compacted;
+        }
+        return crawlHash(preg_replace($non_word, '', $hash_input), true);
+    }
+    /**
+     * Splits an http response document into the http headers sent
+     * and the web page returned. Parses out useful information from
+     * the header and return an array of these two parts and the useful info.
+     *
+     * Processing happens in four stages:
+     *
+     * (1) Redirect scan. Starting at offset 0, the end of the current header
+     * block is located (the first "\r\n\r\n" if one is found at a positive
+     * position, otherwise the first "\n\n"). If a Location: header, or
+     * failing that a Refresh: header, (case-insensitive) starts before the
+     * end of that block, the rest of its line is trimmed and, if non-empty,
+     * appended to CrawlConstants::LOCATION; the scan then repeats from just
+     * after the blank line so that the header blocks of a chain of redirects
+     * are all consumed. A match whose preceding character has ordinal > 32
+     * is treated as the tail of some other header name (for example
+     * X-XRDS-Location) and ends the scan.
+     * NOTE(review): when neither header is found $redirect_pos is false and
+     * the isset() check is evaluated on offset -1; the outcome is still that
+     * the loop ends, but the check reads the last character of the input.
+     *
+     * (2) Split. If a header end was found, everything before it becomes
+     * CrawlConstants::HEADER and the left-trimmed remainder is stored under
+     * $value. Otherwise the whole input is the header and the page is set
+     * to a single space.
+     *
+     * (3) Header parsing. The second whitespace separated token of the first
+     * header line becomes CrawlConstants::HTTP_CODE (0 if absent). Each
+     * remaining line is trimmed and tested with case-insensitive substring
+     * checks for Server:, Content-type:, charset=, Last-Modified:,
+     * X-Robots-Tag:, Content-Range:, ETag: and Expires:, plus a
+     * case-sensitive regex for a Link: ...; rel=canonical header.
+     * ETag and Expires are only examined when C\USE_ETAG_EXPIRES is true;
+     * their values are collected in an array with keys 'etag' and 'expires'
+     * (each -1 until seen) that is stored under
+     * CrawlConstants::CACHE_PAGE_VALIDATORS once either has been set.
+     * The ETag value is taken as the second space separated piece of the
+     * text after "ETag:", so a header with no space after the colon is not
+     * recorded.
+     * NOTE(review): $site[CrawlConstants::URL] does not appear to be set
+     * anywhere in this method, so the rel=canonical branch looks reachable
+     * only if $value happens to equal CrawlConstants::URL -- confirm.
+     *
+     * (4) Encoding. guessEncodingHtml is called on the page. If it returns
+     * an array, that array is read as [encoding, start, length] and the
+     * span of that length at that start is cut out of the page; otherwise
+     * its return value is stored directly as CrawlConstants::ENCODING
+     * (overriding any charset= found in the headers). Finally
+     * CrawlConstants::SERVER defaults to "unknown" if no Server: header
+     * was seen.
+     *
+     * @param string $header_and_page string of downloaded data
+     * @param string $value field to store the page portion of page
+     * @return array info array consisting of a header, page for an http
+     *     response, as well as parsed from the header the server, server
+     *     version, operating system, encoding, and date information.
+     */
+    public static function parseHeaderPage($header_and_page,
+        $value=CrawlConstants::PAGE)
+    {
+        $cache_page_validators = [];
+        $cache_page_validators['etag'] = -1;
+        $cache_page_validators['expires'] = -1;
+        $new_offset = 0;
+        // header will include all redirect headers
+        $site = [];
+        $site[CrawlConstants::LOCATION] = [];
+        do {
+            $continue = false;
+            $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
+                $new_offset);
+            $LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
+            //either two CRLF (what spec says) or two LF's to be safe
+            $old_offset = $new_offset;
+            $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
+            $header_offset = ($header_offset) ? $header_offset : 0;
+            $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
+                : $header_offset + 2;
+            $redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
+            $redirect_str = "Location:";
+            if ($redirect_pos === false) {
+                $redirect_pos =
+                    stripos($header_and_page, 'Refresh:', $old_offset);
+                $redirect_str = "Refresh:";
+            }
+            if (isset($header_and_page[$redirect_pos - 1]) &&
+                ord($header_and_page[$redirect_pos - 1]) > 32) {
+                $redirect_pos = $new_offset; //ignore X-XRDS-Location header
+            } else if ($redirect_pos !== false && $redirect_pos < $new_offset){
+                $redirect_pos += strlen($redirect_str);
+                $pre_line = substr($header_and_page, $redirect_pos,
+                    strpos($header_and_page, "\n", $redirect_pos) -
+                    $redirect_pos);
+                $loc = @trim($pre_line);
+                if (strlen($loc) > 0) {
+                    $site[CrawlConstants::LOCATION][] = @$loc;
+                }
+                $continue = true; // look for a further redirect header block
+            }
+        } while($continue);
+        if ($header_offset > 0) {
+            $site[CrawlConstants::HEADER] =
+                substr($header_and_page, 0, $header_offset);
+            $site[$value] = ltrim(substr($header_and_page, $header_offset));
+        } else { //header message no body; maybe 301?
+            $site[CrawlConstants::HEADER] = $header_and_page;
+            $site[$value] = " ";
+        }
+        $lines = explode("\n", $site[CrawlConstants::HEADER]);
+        $first_line = array_shift($lines);
+        $response = preg_split("/(\s+)/", $first_line);
+        $site[CrawlConstants::HTTP_CODE] = isset($response[1]) ?
+             @trim($response[1]) : 0;
+        $site[CrawlConstants::ROBOT_METAS] = [];
+        foreach ($lines as $line) {
+            $line = trim($line);
+            if (stristr($line, 'Server:')) {
+                $server_parts = preg_split("/Server\:/i", $line);
+                $server_name_parts = @explode("/", $server_parts[1]);
+                $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
+                if (isset($server_name_parts[1])) {
+                    $version_parts = explode("(", $server_name_parts[1]);
+                    $site[CrawlConstants::SERVER_VERSION] =
+                        @trim($version_parts[0]);
+                    if (isset($version_parts[1])) {
+                        $os_parts = explode(")", $version_parts[1]);
+                        $site[CrawlConstants::OPERATING_SYSTEM] =
+                            @trim($os_parts[0]);
+                    }
+                }
+            }
+            if (stristr($line, 'Content-type:')) {
+                list(,$mimetype,) = preg_split("/:|;/i", $line);
+                $site[CrawlConstants::TYPE] = trim($mimetype);
+            }
+            if (stristr($line, 'charset=')) {
+                $line_parts = preg_split("/charset\=/i", $line);
+                $site[CrawlConstants::ENCODING] =
+                    strtoupper(@trim($line_parts[1]));
+            }
+            if (stristr($line, 'Last-Modified:')) {
+                $line_parts = preg_split("/Last\-Modified\:/i", $line);
+                $site[CrawlConstants::MODIFIED] =
+                    strtotime(@trim($line_parts[1]));
+            }
+            if (stristr($line, 'X-Robots-Tag:')) { // robot directives pdfs etc
+                $line_parts = preg_split("/X\-Robots\-Tag\:/i", $line);
+                $robot_metas = explode(",", $line_parts[1]);
+                foreach ($robot_metas as $robot_meta) {
+                    $site[CrawlConstants::ROBOT_METAS][] = strtoupper(
+                        trim($robot_meta));
+                }
+            }
+            if (stristr($line, 'Content-Range:')) {
+                $line_parts = explode("/", $line); // total size follows the /
+                if (!empty($line_parts[1])) {
+                    $content_size = intval(trim($line_parts[1]));
+                    if ($content_size > 0) {
+                        $site[CrawlConstants::CONTENT_SIZE] = $content_size;
+                    }
+                }
+            }
+            $canonical_regex = "/Link\:\s*\<\s*(http.*)\s*\>\s*\;\s*".
+                "rel\s*\=\s*(\"|')?canonical(\"|')?/";
+            // levenshtein gives notices on strings longer than 255
+            if (preg_match($canonical_regex, $line, $matches) &&
+                isset($site[CrawlConstants::URL]) && strlen($matches[1]) < 252
+                && (strlen($site[CrawlConstants::URL]) >= 255 ||
+                levenshtein($matches[1], $site[CrawlConstants::URL]) > 3)) {
+                // for rel canonical headers
+                $site[CrawlConstants::LOCATION][] = $matches[1];
+                $site[CrawlConstants::ROBOT_METAS][] = 'NOFOLLOW';
+            }
+            if (C\USE_ETAG_EXPIRES && stristr($line, 'ETag:')) {
+                $line_parts = preg_split("/ETag\:/i", $line);
+                if (isset($line_parts[1])) {
+                    $etag_data = explode(" ", $line_parts[1]);
+                    if (isset($etag_data[1])) { // needs a space after "ETag:"
+                        $etag = $etag_data[1];
+                        $cache_page_validators['etag'] = $etag;
+                    }
+                }
+            }
+            if (C\USE_ETAG_EXPIRES && stristr($line, 'Expires:')) {
+                $line_parts = preg_split("/Expires\:/i", $line);
+                $all_dates = $line_parts[1];
+                $date_parts = explode(",", $all_dates);
+                if (count($date_parts) == 2) {
+                    $cache_page_validators['expires'] = strtotime(
+                        $date_parts[1]);
+                } else if (count($date_parts) > 2) {
+                    /*Encountered some pages with more than one Expires date
+                      :O In that case every odd-indexed comma separated piece
+                      is parsed as a date and the smallest timestamp is kept */
+                    $timestamps = [];
+                    for ($i = 1;$i < count($date_parts); $i += 2) {
+                        $ds = strtotime($date_parts[$i]);
+                        $timestamps[] = $ds;
+                    }
+                    $lowest = min($timestamps);
+                    $cache_page_validators['expires'] = $lowest;
+                }
+            }
+            if (C\USE_ETAG_EXPIRES && !($cache_page_validators['etag'] == -1 &&
+                $cache_page_validators['expires'] == -1)) {
+                $site[CrawlConstants::CACHE_PAGE_VALIDATORS] =
+                    $cache_page_validators;
+            }
+        }
+        /*
+           If the doc is HTML and it uses a http-equiv to set the encoding
+           then we override what the server says (if anything). As we
+           are going to convert to UTF-8 we remove the charset info
+           from the meta tag so cached pages will display correctly and
+           redirects without char encoding won't be given a different hash.
+         */
+        $encoding_info = guessEncodingHtml($site[$value], true);
+        if (is_array($encoding_info)) {
+            list($site[CrawlConstants::ENCODING], $start_charset, $len_c) =
+            $encoding_info;
+            $site[$value] = substr_replace($site[$value], "", $start_charset,
+                $len_c);
+        } else {
+            $site[CrawlConstants::ENCODING] = $encoding_info;
+        }
+        if (!isset($site[CrawlConstants::SERVER]) ) {
+            $site[CrawlConstants::SERVER] = "unknown";
+        }
+        return $site;
+    }
+    /**
+     * Extracts the IP addresses that were tried when connecting from a
+     * transcript of an HTTP get/response
+     *
+     * The transcript is searched for text of the form "Trying ADDRESS...".
+     * Each ADDRESS is accepted if inet_pton can parse it. As some
+     * transcripts append the port to the address ("1.2.3.4:443" or
+     * "[2001:db8::1]:443"), an ADDRESS that does not parse as written is
+     * retried with the brackets and/or trailing ":port" removed.
+     *
+     * @param string $header contains complete transcript of HTTP get/response
+     * @return mixed array of the distinct IPv4 or IPv6 address strings found,
+     *     in order of first appearance; false if there were none
+     */
+    public static function getCurlIp($header)
+    {
+        if (!preg_match_all('/Trying\s+(.*)(\.\.\.)/', $header, $matches)) {
+            return false;
+        }
+        $out_addresses = [];
+        foreach ($matches[1] as $candidate) {
+            // forms to try, most literal first so plain IPv6 is never cut
+            $forms = [$candidate];
+            if (preg_match('/^\[(.+)\]:\d+$/', $candidate, $parts)) {
+                $forms[] = $parts[1];
+            } else if (preg_match('/^(.+):\d+$/', $candidate, $parts)) {
+                $forms[] = $parts[1];
+            }
+            foreach ($forms as $address) {
+                if (@inet_pton($address) === false) {
+                    continue;
+                }
+                if (!in_array($address, $out_addresses, true)) {
+                    $out_addresses[] = $address;
+                }
+                break;
+            }
+        }
+        return ($out_addresses != []) ? $out_addresses : false;
+    }
+    /**
+     * Make a curl request for the provided url
+     *
+     * Curl handles are cached per host in a static array (once the cache
+     * holds more than 50 handles the oldest one is closed and dropped) so
+     * repeated requests to the same host reuse a handle. If no host can be
+     * parsed out of $site, the handle cached under the empty string is used,
+     * so a malformed url results in an ordinary failed curl request rather
+     * than curl options being set on a non-existent handle.
+     *
+     * NOTE(review): because handles are reused, options set only when
+     * $user_password is given (CURLOPT_USERPWD, CURLOPT_SSLVERSION) stay
+     * on the handle for later requests to the same host -- confirm whether
+     * that is acceptable.
+     *
+     * @param string $site url of page to request
+     * @param array $post_data  any data to be POST'd to the URL
+     * @param bool $check_for_errors whether or not to check the response
+     *      for the words, NOTICE, WARNING, FATAL which might indicate an
+     *      error on the server
+     * @param string $user_password username:password to use for connection if
+     *      needed (optional)
+     * @return string the contents of what the curl request fetched (the
+     *      return value of curl_exec, so false if the request failed)
+     */
+    public static function getPage($site, $post_data = null,
+        $check_for_errors = false, $user_password = null)
+    {
+        static $agents = [];
+        $not_web_setting = (php_sapi_name() == 'cli');
+        $MAX_SIZE = 50;
+        $host = @parse_url($site, PHP_URL_HOST);
+        if (!is_string($host)) {
+            /* parse_url gives null if there is no host and false for a
+               seriously malformed url; use one well-defined cache key for
+               both cases
+             */
+            $host = "";
+        }
+        if (count($agents) > $MAX_SIZE) {
+            $agent_host = array_shift($agents);
+            if ($agent_host) {
+                curl_close($agent_host);
+            }
+        }
+        if (empty($agents[$host])) {
+            $agents[$host] = curl_init();
+        }
+        $agent = $agents[$host];
+        if ($not_web_setting) {
+            crawlLog("  Init curl request of a single page");
+        }
+        curl_setopt($agent, CURLOPT_USERAGENT, C\USER_AGENT);
+        curl_setopt($agent, CURLOPT_URL, $site);
+        curl_setopt($agent, CURLOPT_AUTOREFERER, true);
+        curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true);
+        // these next two lines should probably be modified for better security
+        curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 0);
+        curl_setopt($agent, CURLOPT_SSL_VERIFYPEER, false);
+        curl_setopt($agent, CURLOPT_NOSIGNAL, true);
+        curl_setopt($agent, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($agent, CURLOPT_FAILONERROR, true);
+        curl_setopt($agent, CURLOPT_TIMEOUT, C\SINGLE_PAGE_TIMEOUT);
+        curl_setopt($agent, CURLOPT_CONNECTTIMEOUT, C\PAGE_TIMEOUT);
+        //make lighttpd happier
+        curl_setopt($agent, CURLOPT_HTTPHEADER, ['Expect:']);
+        if ($post_data != null) {
+            curl_setopt($agent, CURLOPT_POST, true);
+            curl_setopt($agent, CURLOPT_POSTFIELDS, $post_data);
+        } else {
+            // since we are caching agents, need to do this so doesn't get stuck
+            // as post and so query string ignored for get's
+            curl_setopt($agent, CURLOPT_HTTPGET, true);
+        }
+        if ($user_password != null) {
+            curl_setopt($agent, CURLOPT_FAILONERROR, false);
+            curl_setopt($agent, CURLOPT_USERPWD, $user_password);
+            curl_setopt($agent, CURLOPT_SSL_VERIFYHOST, 2);
+            curl_setopt($agent, CURLOPT_SSLVERSION,
+                CURL_SSLVERSION_TLSv1_2);
+        }
+        if ($not_web_setting) {
+            crawlLog("  Set curl options for single page request");
+        }
+        $time = time();
+        $response = curl_exec($agent);
+        if (time() - $time > C\PAGE_TIMEOUT && $not_web_setting) {
+            crawlLog("  Request took longer than page timeout!!");
+            crawlLog("  Either could not reach URL or website took too");
+            crawlLog("  long to respond.");
+        }
+        // clear the post body so it is not kept on the cached handle
+        curl_setopt($agent, CURLOPT_POSTFIELDS, "");
+        if ($not_web_setting) {
+            crawlLog("  Done curl exec");
+        }
+        if ($not_web_setting && $check_for_errors) {
+            self::checkResponseForErrors($response);
+        }
+        return $response;
+    }
+    /**
+     * Scans the result of a getPage call for any of the words NOTICE,
+     * WARNING, or FATAL (in any letter case), which might indicate the
+     * server had an error. When one is present, a message followed by the
+     * word-wrapped $response is written using crawlLog; otherwise nothing
+     * is done.
+     *
+     * @param string $response getPage response in which to check for errors
+     */
+    public static function checkResponseForErrors($response)
+    {
+        $looks_like_error = preg_match("/NOTICE|WARNING|FATAL/i", $response);
+        if (!$looks_like_error) {
+            return;
+        }
+        $message = "There appears to have been an error in the server ".
+            "response. Response was:";
+        crawlLog($message);
+        crawlLog(wordwrap($response));
+    }
+}
--- a/src/library/FileCache.php
+++ b/src/library/FileCache.php
@ -0,0 +1,156 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\models\datasources as D;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Library of functions used to implement a simple file cache
+ *
+ * Each key is assigned to one of self::NUMBER_OF_BINS sub-folders ("bins")
+ * of the cache folder, and its value is stored serialized in a file in that
+ * bin. Each bin also has a last_expired.txt file recording how many items
+ * have been stored in it; when that count exceeds self::MAX_FILES_IN_A_BIN
+ * the whole bin is deleted on the next set().
+ *
+ * @author Chris Pollett
+ */
+class FileCache
+{
+    /**
+     * File used to serve last cache request
+     * @var string
+     */
+    public $cache_file;
+    /**
+     * Folder name to use for this FileCache
+     * @var string
+     */
+    public $dir_name;
+    /**
+     * Total number of bins to cycle between
+     */
+    const NUMBER_OF_BINS = 24;
+    /**
+     * Maximum number of files in a bin
+     */
+    const MAX_FILES_IN_A_BIN = 10000;
+    /**
+     * Records the folder to use for the file cache and creates it (making
+     * it world accessible via the configured datasource manager) if it does
+     * not already exist
+     *
+     * @param string $dir_name folder name of where to put the file cache
+     */
+    public function __construct($dir_name)
+    {
+        $this->dir_name = $dir_name;
+
+        if (!is_dir($this->dir_name)) {
+            mkdir($this->dir_name);
+            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
+            $db = new $db_class();
+            $db->setWorldPermissionsRecursive($this->dir_name, true);
+        }
+    }
+    /**
+     * Retrieve data associated with a key that has been put in the cache
+     *
+     * NOTE(review): the stored file is passed to unserialize(), so the
+     * cache folder must not be writable by untrusted parties.
+     *
+     * @param string $key the key to look up
+     * @return mixed the data associated with the key if it exists, false
+     *     otherwise
+     */
+    public function get($key)
+    {
+        $checksum_block = $this->checksum($key);
+        $this->cache_file = $this->dir_name . "/$checksum_block/" .
+            webencode($key);
+        if (file_exists($this->cache_file)) {
+            return unserialize(file_get_contents($this->cache_file));
+        }
+        return false;
+    }
+    /**
+     * Stores in the cache a key-value pair
+     *
+     * Only when a key is set is there a check for whether to invalidate
+     * a cache bin. The bin the key belongs to is deleted if the number of
+     * items recorded for it is more than self::MAX_FILES_IN_A_BIN; its
+     * bookkeeping (item count and last expired time) is then started afresh
+     * so that the bin can fill up again rather than being deleted on every
+     * subsequent call.
+     *
+     * @param string $key to associate with value
+     * @param mixed $value to store
+     */
+    public function set($key, $value)
+    {
+        $checksum_block = $this->checksum($key);
+        $checksum_dir = $this->dir_name."/$checksum_block";
+        $info_file = "$checksum_dir/last_expired.txt";
+        $data = null;
+        if (file_exists($info_file)) {
+            $data = unserialize(file_get_contents($info_file));
+        }
+        if (!isset($data['last_expired'], $data['count'])) {
+            // missing or unreadable bookkeeping: start over
+            $data = ['last_expired' => time(), 'count' => 0];
+        }
+        if ($data['count'] > self::MAX_FILES_IN_A_BIN) {
+            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
+            $db = new $db_class();
+            $db->unlinkRecursive($checksum_dir);
+            // the bin is now empty, so its count must restart from 0
+            $data = ['last_expired' => time(), 'count' => 0];
+        }
+        if (!file_exists($checksum_dir)) {
+            mkdir($checksum_dir);
+            $data['last_expired'] = time(); /* currently count is used rather
+                than time, but we store time anyway.
+                */
+        }
+        $cache_file = "$checksum_dir/".webencode($key);
+        if (!file_exists($cache_file)) {
+            // overwriting an existing item does not add a file to the bin
+            $data['count']++;
+        }
+        file_put_contents($info_file, serialize($data));
+        file_put_contents($cache_file, serialize($value));
+    }
+    /**
+     * Maps the provided key to the number of the bin it is stored in
+     *
+     * The bin is the sum of the byte values of the key modulo
+     * self::NUMBER_OF_BINS.
+     *
+     * @param string $key to convert to a bin number
+     * @return int value between 0 and self::NUMBER_OF_BINS - 1 inclusive
+     */
+    public function checksum($key)
+    {
+        $len = strlen($key);
+        $value = 0;
+        for ($i = 0; $i < $len; $i++) {
+            $value += ord($key[$i]);
+        }
+        return ($value % self::NUMBER_OF_BINS);
+    }
+}
--- a/src/library/HashTable.php
+++ b/src/library/HashTable.php
@ -0,0 +1,294 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ *
+ * Code used to manage a memory efficient hash table
+ *
+ * Entries are fixed size records (key followed by value) stored in the
+ * slots of the underlying StringArray. Collisions are resolved by linear
+ * probing: starting from the slot a key hashes to, successive slots
+ * (wrapping around) are examined.
+ *
+ * @author Chris Pollett
+ */
+class HashTable extends StringArray
+{
+    /**
+     * The size in bytes for keys stored in the hash table
+     *
+     * @var int
+     */
+    public $key_size;
+    /**
+     * The size in bytes of values associated with keys
+     *
+     * @var int
+     */
+    public $value_size;
+    /**
+     * Holds an all \0 string of length $this->key_size. A slot whose key
+     * equals this string has never held data.
+     * @var string
+     */
+    public $null;
+    /**
+     * Holds a \xFF byte followed by $this->key_size - 1 \0 bytes.
+     * Used to indicate that a slot once held data but that data was deleted.
+     * Such a slot tells a lookup to keep going, but on an insert can be
+     * overwritten if the inserted key is not already in the table
+     * @var string
+     */
+    public $deleted;
+    /**
+     * Number of items currently in the hash table
+     * @var int
+     */
+    public $count;
+    /**
+     * Flag for hash table lookup methods
+     */
+    const ALWAYS_RETURN_PROBE = 1;
+    /**
+     * Flag for hash table lookup methods
+     */
+    const RETURN_PROBE_ON_KEY_FOUND = 0;
+    /**
+     * Flag for hash table lookup methods
+     */
+    const RETURN_VALUE = -1;
+    /**
+     * Flag for hash table lookup methods
+     */
+    const RETURN_BOTH = -2;
+    /**
+     * Makes a persistently stored (i.e., on disk and ram)  hash table using the
+     * supplied parameters
+     *
+     * @param string $fname filename to use when storing the hash table to disk
+     * @param int $num_values number of key value pairs the table can hold
+     * @param int $key_size number of bytes to store a hash table key
+     * @param int $value_size number of bytes to store a hash table value
+     * @param int $save_frequency how many non read operation before saving to
+     *     disk
+     */
+    public function __construct($fname, $num_values, $key_size, $value_size,
+        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
+    {
+        $this->key_size = $key_size;
+        $this->value_size = $value_size;
+        $this->null = pack("x". $this->key_size);
+        $this->deleted = pack("H2x".($this->key_size - 1), "FF");
+        $this->count = 0;
+        parent::__construct($fname, $num_values,
+            $key_size + $value_size, $save_frequency);
+    }
+    /**
+     * Inserts the provided $key - $value pair into the hash table
+     *
+     * The stored record always has exactly $this->key_size bytes of key
+     * followed by $this->value_size bytes of value: longer strings are
+     * truncated and shorter ones are padded on the right with \0 bytes.
+     *
+     * NOTE(review): $this->count is incremented even when the slot written
+     * already holds $key (i.e., when an existing entry is overwritten) --
+     * confirm whether callers rely on this.
+     *
+     * @param string $key the key to use for the insert (will be needed for
+     *     lookup)
+     * @param string $value the value associated with $key
+     * @param int $probe if the location in the hash table is already known
+     *     to be $probe then this variable can be used to save a lookup
+     * @return bool whether the insert was successful or not
+     */
+    public function insert($key, $value, $probe = false)
+    {
+        $null = $this->null;
+        $deleted = $this->deleted;
+
+        if ($probe === false) {
+            $probe = $this->lookup($key, self::ALWAYS_RETURN_PROBE);
+        }
+        if ($probe === false) {
+            /* this is a little slow
+               the idea is we can't use deleted slots until we are sure
+               $key isn't in the table
+             */
+            $probe = $this->lookupArray(
+                $key, [$null, $deleted], self::ALWAYS_RETURN_PROBE);
+            if ($probe === false) {
+                crawlLog("No space in hash table");
+                return false;
+            }
+        }
+        /* there was a free slot so write entry: first the key then the
+           value, each forced to its fixed width. Note the pad string must be
+           the double quoted "\0" (a NUL byte); single quoted it would be the
+           two characters backslash and zero.
+         */
+        $data = str_pad(substr($key, 0, $this->key_size),
+            $this->key_size, "\0") .
+            str_pad(substr($value, 0, $this->value_size),
+            $this->value_size, "\0");
+        $this->put($probe, $data);
+        $this->count++;
+        $this->checkSave();
+        return true;
+    }
+    /**
+     * Tries to lookup the key in the hash table either return the
+     * location where it was found or the value associated with the key.
+     *
+     * @param string $key key to look up in the hash table
+     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
+     *     self::RETURN_PROBE_ON_KEY_FOUND, self::RETURN_VALUE, or
+     *     self::RETURN_BOTH.
+     *     Here value means the value associated with the key and probe is
+     *     either the location in the array where the key was found or
+     *     the first location in the array where it was determined the
+     *     key could not be found.
+     * @return mixed would be string if the value is being returned,
+     *     an int if the probe is being returned, an array [probe, value]
+     *     for self::RETURN_BOTH, and false if the key is not found
+     */
+    public function lookup($key, $return_probe_value = self::RETURN_VALUE)
+    {
+        return $this->lookupArray(
+            $key, [$this->null], $return_probe_value);
+    }
+    /**
+     * Tries to lookup the key in the hash table either return the
+     * location where it was found or the value associated with the key.
+     * If the key is not at the initial probe value, linear search in the
+     * table is done. The values which cut-off the search are stored in
+     * $null_array. Using an array allows for flexibility since a deleted
+     * entry needs to be handled different when doing a lookup then when
+     * doing an insert.
+     *
+     * @param string $key key to look up in the hash table
+     * @param array $null_array key values that would cut-off the search
+     *     for key if the initial probe failed
+     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
+     *     self::RETURN_PROBE_ON_KEY_FOUND, self::RETURN_VALUE, or
+     *     self::RETURN_BOTH. Here
+     *     value means the value associated with the key and probe is
+     *     either the location in the array where the key was found or
+     *     the first location in the array where it was determined the
+     *     key could not be found.
+     * @return mixed would be string if the value is being returned,
+     *     an int if the probe is being returned, an array [probe, value]
+     *     for self::RETURN_BOTH, and false if the key is not found (unless
+     *     self::ALWAYS_RETURN_PROBE was requested and a cut-off slot was
+     *     reached, in which case that slot's location is returned)
+     */
+    public function lookupArray($key, $null_array,
+        $return_probe_value = self::RETURN_VALUE)
+    {
+        $index = $this->hash($key);
+        $num_values = $this->num_values;
+        $probe_array = [self::RETURN_PROBE_ON_KEY_FOUND,
+            self::ALWAYS_RETURN_PROBE];
+        for ($j = 0; $j < $num_values; $j++)  {
+            $probe = ($index + $j) % $num_values;
+            list($index_key, $index_value) = $this->getEntry($probe);
+            // keys are raw byte strings so compare them strictly
+            if (in_array($index_key, $null_array, true)) {
+                if ($return_probe_value == self::ALWAYS_RETURN_PROBE) {
+                    return $probe;
+                } else {
+                    return false;
+                }
+            }
+
+            if (strcmp($key, $index_key) == 0) { break; }
+        }
+        // every slot was examined without finding the key or a cut-off
+        if ($j == $num_values) {return false;}
+
+        $result = $index_value;
+        if (in_array($return_probe_value, $probe_array)) {
+            $result = $probe;
+        }
+        if ($return_probe_value == self::RETURN_BOTH) {
+            $result = [$probe, $index_value];
+        }
+        return $result;
+    }
+    /**
+     * Deletes the data associated with the provided key from the hash table
+     *
+     * The slot is overwritten with a record whose key part equals
+     * $this->deleted and whose value part is all \0 bytes.
+     *
+     * @param string $key the key to delete the entry for
+     * @param int $probe if the location in the hash table is already known
+     *     to be $probe then this variable can be used to save a lookup
+     * @return bool whether or not something was deleted
+     */
+    public function delete($key, $probe = false)
+    {
+        $deleted = pack("H2x".($this->key_size + $this->value_size - 1), "FF");
+            //deletes
+        if ($probe === false) {
+            $probe = $this->lookup($key, self::RETURN_PROBE_ON_KEY_FOUND);
+        }
+        if ($probe === false) { return false; }
+        $this->put($probe, $deleted);
+        $this->count--;
+        $this->checkSave();
+        return true;
+    }
+    /**
+     * Get the ith entry of the array for the hash table (no hashing here)
+     *
+     * @param int $i an index of the hash table array
+     * @return array the key value pair stored at this index
+     */
+    public function getEntry($i)
+    {
+        $raw = $this->get($i);
+        $key = substr($raw, 0, $this->key_size);
+        $value = substr($raw, $this->key_size, $this->value_size);
+        return [$key, $value];
+    }
+    /**
+     * Hashes the provided key to an index in the array of the hash table
+     *
+     * The first three bytes of the md5 of the key are read as a 24 bit
+     * number which is then scaled into the range 0 .. $this->num_values - 1.
+     *
+     * @param string $key a key to hashed into the hash table
+     * @return int an index in the array of the hash table
+     */
+    public function hash($key)
+    {
+        $tmp = md5($key, true);
+        $pre_index = ((ord($tmp[0]) << 8) + ord($tmp[1]) << 8) + ord($tmp[2]);
+        // 2 << 23 is 2^24, one more than the largest possible $pre_index
+        $index = (int)floor($pre_index * $this->num_values/(2 << 23));
+        return $index;
+    }
+    /**
+     * Pretty prints the contents of the hash table viewed as an array.
+     * Slots are numbered from 0 to $this->num_values - 1, the same
+     * locations that lookups probe.
+     */
+    public function printContents()
+    {
+        for ($i = 0; $i < $this->num_values; $i++) {
+            $row = $this->getEntry($i);
+            print "Entry: $i Key:".$row[0]." Value: ".$row[1]."\n";
+        }
+    }
+}
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@ -0,0 +1,473 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Used for crawlLog and crawlHash
+ */
+require_once __DIR__.'/Utility.php';
+/**
+ * Encapsulates a set of web page summaries and an inverted word-index of terms
+ * from these summaries which allow one to search for summaries containing a
+ * particular word.
+ *
+ * The basic file structures for an IndexArchiveBundle are:
+ * <ol>
+ * <li>A WebArchiveBundle for web page summaries.</li>
+ * <li>A IndexDictionary containing all the words stored in the bundle.
+ * Each word entry in the dictionary contains starting and ending
+ * offsets for documents containing that word for some particular IndexShard
+ * generation.</li>
+ * <li>A set of index shard generations. These generations
+ * have names index0, index1,... A shard has word entries, word doc entries
+ * and document entries. For more information see the index shard
+ * documentation.
+ * </li>
+ * <li>
+ * The file generation.txt keeps track of what is the current generation.
+ * A given generation can hold NUM_DOCS_PER_GENERATION documents amongst all
+ * its partitions, after which the next generation begins.
+ * </li>
+ * </ol>
+ *
+ *
+ * @author Chris Pollett
+ */
+class IndexArchiveBundle implements CrawlConstants
+{
+    /**
+     * Folder name to use for this IndexArchiveBundle
+     * @var string
+     */
+    public $dir_name;
+    /**
+     * A short text name for this IndexArchiveBundle
+     * @var string
+     */
+    public $description;
+    /**
+     * Number of partitions in the summaries WebArchiveBundle
+     * @var int
+     */
+    public $num_partitions_summaries;
+    /**
+     * structure contains info about the current generation:
+     * its index (ACTIVE), and the number of words it contains
+     * (NUM_WORDS).
+     * @var array
+     */
+    public $generation_info;
+    /**
+     * Number of docs before a new generation is started
+     * @var int
+     */
+    public $num_docs_per_generation;
+    /**
+     * WebArchiveBundle for web page summaries
+     * @var object
+     */
+    public $summaries;
+    /**
+     * IndexDictionary for all shards in the IndexArchiveBundle
+     * This contains entries of the form (word, num_shards with word,
+     * posting list info 0th shard containing the word,
+     * posting list info 1st shard containing the word, ...)
+     * @var object
+     */
+    public $dictionary;
+    /**
+     * Index Shard for current generation inverted word index
+     * @var object
+     */
+    public $current_shard;
+    /**
+     * What version of index archive bundle this is
+     * @var int
+     */
+    public $version;
+    /**
+     * Threshold file size (in bytes) beyond which we don't load an old
+     * index shard when restarting and instead just advance to a new shard
+     */
+    const NO_LOAD_SIZE = 50000000;
+    /**
+     * Makes or initializes an IndexArchiveBundle with the provided parameters
+     *
+     * @param string $dir_name folder name to store this bundle
+     * @param bool $read_only_archive whether to open archive only for reading
+     *  or reading and writing
+     * @param string $description a text name/serialized info about this
+     *      IndexArchiveBundle
+     * @param int $num_docs_per_generation the number of pages to be stored
+     *      in a single shard
+     */
+    public function __construct($dir_name, $read_only_archive = true,
+        $description = null, $num_docs_per_generation =
+        C\NUM_DOCS_PER_GENERATION)
+    {
+        $this->dir_name = $dir_name;
+        if (!is_dir($this->dir_name)) {
+            if ($read_only_archive) {
+                /* Nothing on disk to open. A value returned from a
+                   constructor is discarded by new, so we simply stop here:
+                   the object is left with only dir_name set (no summaries,
+                   no dictionary) and callers have to check for that.
+                 */
+                return;
+            }
+            // writable bundle: lay out the folder structure for a new index
+            mkdir($this->dir_name);
+            mkdir($this->dir_name . "/posting_doc_shards");
+        }
+        $generation_file = $this->dir_name . "/generation.txt";
+        if (file_exists($generation_file)) {
+            $this->generation_info = unserialize(
+                file_get_contents($generation_file));
+        } else if (!$read_only_archive) {
+            // first use of a writable bundle begins at generation 0
+            $this->generation_info['ACTIVE'] = 0;
+            file_put_contents($generation_file,
+                serialize($this->generation_info));
+        }
+        $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
+            $read_only_archive, -1, $description);
+        if (!$read_only_archive) {
+            $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
+        }
+        // the description and version are whatever the summaries bundle has
+        $this->description = $this->summaries->description;
+        if (isset($this->summaries->version)) {
+            $this->version = $this->summaries->version;
+        }
+        $this->num_docs_per_generation = $num_docs_per_generation;
+        $this->dictionary = new IndexDictionary(
+            $this->dir_name . "/dictionary", $this);
+    }
+    /**
+     * Add the array of $pages to the summaries WebArchiveBundle pages being
+     * stored in the partition $generation and the field used
+     * to store the resulting offsets given by $offset_field.
+     *
+     * @param int $generation field used to select partition
+     * @param string $offset_field field used to record offsets after storing
+     * @param array& $pages data to store
+     * @param int $visited_urls_count number to add to the count of visited urls
+     *     (visited urls is a smaller number than the total count of objects
+     *     stored in the index).
+     */
+    public function addPages($generation, $offset_field, &$pages,
+        $visited_urls_count)
+    {
+        $summaries = $this->summaries;
+        // aim writes at the partition numbered the same as the generation
+        $summaries->setWritePartition($generation);
+        $summaries->addPages($offset_field, $pages);
+        // bump the stored count of visited urls by the supplied amount
+        $summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
+    }
+    /**
+     * Adds the provided mini inverted index data to the IndexArchiveBundle
+     * Expects initGenerationToAdd to be called before, so generation is correct
+     *
+     * @param object $index_shard a mini inverted index of word_key=>doc data
+     *     to add to this IndexArchiveBundle
+     */
+    public function addIndexData($index_shard)
+    {
+        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
+        $start_time = microtime(true);
+        $active_shard = $this->getActiveShard();
+        $active_shard->appendIndexShard($index_shard);
+        // report memory in use and how long the append took
+        $memory_used = memory_get_usage();
+        $elapsed = changeInMicrotime($start_time);
+        crawlLog("Append Index Shard: Memory usage:" . $memory_used .
+            " Time: " . $elapsed);
+    }
+    /**
+     * Determines based on its size, if index_shard should be added to
+     * the active generation or in a new generation should be started.
+     * If so, a new generation is started, the old generation is saved, and
+     * the dictionary of the old shard is copied to the bundles dictionary
+     * and a log-merge performed if needed
+     *
+     * @param int $add_num_docs number of docs in the shard about to be added
+     * @param object $callback object with join function to be
+     *     called if process is taking too long
+     * @param bool $blocking whether there is an ongoing merge tiers operation
+     *      occurring, if so don't do anything and return -1
+     * @return int the active generation after the check and possible change has
+     *     been performed
+     */
+    public function initGenerationToAdd($add_num_docs, $callback = null,
+        $blocking = false)
+    {
+        $current_num_docs = $this->getActiveShard()->num_docs;
+        crawlLog("Current index shard has ".$current_num_docs." documents.");
+        $memory_limit = metricToInt(ini_get("memory_limit"));
+        $memory_usage = memory_get_usage();
+        crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
+            $memory_usage);
+        $shard_full = ($current_num_docs + $add_num_docs >
+            $this->num_docs_per_generation);
+        /* PHP uses a memory_limit of -1 to mean "no limit". Without the
+           positivity test 0.55 * limit would be negative and so always below
+           the current usage, forcing a shard switch on every single call.
+         */
+        $memory_tight = ($memory_limit > 0 &&
+            (0.55 * $memory_limit) < $memory_usage);
+        if ($shard_full || $memory_tight) {
+            if ($blocking == true) {
+                // a merge is in progress, signal that no switch was made
+                return -1;
+            }
+            crawlLog("Switching Index Shard...");
+            $switch_time = microtime(true);
+            // Save current shard dictionary to main dictionary
+            $this->forceSave();
+            $this->addAdvanceGeneration($callback);
+            crawlLog("Switch Index Shard time:".
+                changeInMicrotime($switch_time));
+        }
+        return $this->generation_info['ACTIVE'];
+    }
+    /**
+     * Starts a new generation: the dictionary of the old shard is copied to
+     * the bundle's dictionary and a log-merge performed if needed. This
+     * function may be called by initGenerationToAdd as well as when resuming
+     * a crawl, where it is used instead of loading the periodically saved
+     * copy of a shard that is too large.
+     *
+     * @param object $callback object with join function to be
+     *     called if process is taking too long
+     */
+    public function addAdvanceGeneration($callback = null)
+    {
+        // the outgoing shard's words go into the bundle dictionary first
+        $this->addCurrentShardDictionary($callback);
+        // then both generation markers move on to a brand new shard
+        $this->generation_info['ACTIVE']++;
+        $active = $this->generation_info['ACTIVE'];
+        $this->generation_info['CURRENT'] = $active;
+        $shard_file = $this->dir_name . "/posting_doc_shards/index" . $active;
+        $this->current_shard = new IndexShard($shard_file, $active,
+            $this->num_docs_per_generation);
+        // persist the markers so the new generation is recorded on disk
+        file_put_contents($this->dir_name . "/generation.txt",
+            serialize($this->generation_info));
+    }
+    /**
+     * Adds the words from this shard to the dictionary
+     * @param object $callback object with join function to be
+     *     called if process is taking too  long
+     */
+    public function addCurrentShardDictionary($callback = null)
+    {
+        $active = $this->generation_info['ACTIVE'];
+        $shard_file = $this->dir_name . "/posting_doc_shards/index" . $active;
+        /* the shard is opened with its fourth constructor argument set to
+           true because we want to do the copying of dictionary as files to
+           conserve memory in case merge tiers after adding to dictionary
+         */
+        $this->current_shard = new IndexShard($shard_file, $active,
+            $this->num_docs_per_generation, true);
+        $this->dictionary->addShardDictionary($this->current_shard, $callback);
+    }
+    /**
+     * Sets the current shard to be the active shard (the active shard is
+     * what we call the last (highest indexed) shard in the bundle. Then
+     * returns a reference to this shard
+     * @return object last shard in the bundle
+     */
+     public function getActiveShard()
+     {
+        $active = $this->generation_info['ACTIVE'];
+        if ($this->setCurrentShard($active)) {
+            // current was switched to the active generation, so (re)read it
+            return $this->getCurrentShard();
+        }
+        if (!isset($this->current_shard)) {
+            // no switch happened and nothing is held yet: make a new shard
+            $generation = $this->generation_info['CURRENT'];
+            $shard_file = $this->dir_name . "/posting_doc_shards/index" .
+                $generation;
+            $this->current_shard = new IndexShard($shard_file, $generation,
+                $this->num_docs_per_generation);
+        }
+        return $this->current_shard;
+     }
+    /**
+     * Returns the shard which is currently being used to read word-document
+     * data from the bundle. If one wants to write data to the bundle use
+     * getActiveShard() instead. The point of this method is to allow
+     * for lazy reading of the file associated with the shard.
+     *
+     * @param bool $force_read whether to force no advance generation and
+     *      merge dictionary side effects
+     * @return object the index shard currently being read from
+     */
+     public function getCurrentShard($force_read = false)
+     {
+        if (isset($this->current_shard)) {
+            // a shard object is already held, nothing needs to be read
+            return $this->current_shard;
+        }
+        if (!isset($this->generation_info['CURRENT'])) {
+            $this->generation_info['CURRENT'] =
+                $this->generation_info['ACTIVE'];
+        }
+        $generation = $this->generation_info['CURRENT'];
+        $shard_file = $this->dir_name . "/posting_doc_shards/index" .
+            $generation;
+        if (!file_exists($shard_file)) {
+            // no file for this generation yet, so begin with a new shard
+            $this->current_shard = new IndexShard($shard_file, $generation,
+                $this->num_docs_per_generation);
+            return $this->current_shard;
+        }
+        $disk_based = isset($this->generation_info['DISK_BASED']) &&
+            $this->generation_info['DISK_BASED'] == true;
+        if ($disk_based) {
+            // only the header is read now; the shard is flagged read only
+            $this->current_shard = new IndexShard($shard_file, $generation,
+                $this->num_docs_per_generation, true);
+            $this->current_shard->getShardHeader();
+            $this->current_shard->read_only_from_disk = true;
+        } else if ($force_read ||
+            filesize($shard_file) <= self::NO_LOAD_SIZE) {
+            $this->current_shard = IndexShard::load($shard_file);
+        } else {
+            /* file is above NO_LOAD_SIZE and reading was not forced, so
+               rather than load it we move on to a new generation
+             */
+            $this->addAdvanceGeneration();
+        }
+        return $this->current_shard;
+     }
+    /**
+     * Sets the current shard to be the $i th shard in the index bundle.
+     *
+     * @param $i which shard to set the current shard to be
+     * @param $disk_based whether to read the whole shard in before using or
+     *     leave it on disk except for pages need and use memcache
+     */
+     public function setCurrentShard($i, $disk_based = false)
+     {
+        // the disk based flag is recorded whether or not a switch happens
+        $this->generation_info['DISK_BASED'] = $disk_based;
+        $markers_known = isset($this->generation_info['CURRENT'],
+            $this->generation_info['ACTIVE']);
+        if ($markers_known && ($i == $this->generation_info['CURRENT'] ||
+            $i > $this->generation_info['ACTIVE'])) {
+            // already on shard $i, or $i is beyond the active generation
+            return false;
+        }
+        $this->generation_info['CURRENT'] = $i;
+        // drop the held shard so the next getCurrentShard() re-reads it
+        unset($this->current_shard);
+        return true;
+     }
+    /**
+     * Gets the page out of the summaries WebArchiveBundle with the given
+     * offset and generation
+     *
+     * @param int $offset byte offset in partition of desired page
+     * @param int $generation which generation WebArchive to look up in
+     *     defaults to the same number as the current shard
+     * @return array desired page
+     */
+    public function getPage($offset, $generation = -1)
+    {
+        // -1 stands for "whatever generation is current"
+        $lookup_generation = ($generation == -1) ?
+            $this->generation_info['CURRENT'] : $generation;
+        return $this->summaries->getPage($offset, $lookup_generation);
+    }
+    /**
+     * Forces the current shard to be saved
+     */
+    public function forceSave()
+    {
+        $active_shard = $this->getActiveShard();
+        $active_shard->save(false, true);
+    }
+    /**
+     * Computes the number of occurrences of each of the supplied list of
+     * word_keys
+     *
+     * @param array $word_keys keys to compute counts for
+     * @return array associative array of key => count values.
+     */
+    public function countWordKeys($word_keys)
+    {
+        if (!is_array($word_keys) || count($word_keys) < 1) {
+            return null;
+        }
+        $counts = [];
+        foreach ($word_keys as $word_key) {
+            $word_info = $this->dictionary->getWordInfo($word_key);
+            // a key the dictionary does not know about counts as zero
+            $occurrences = 0;
+            if ($word_info !== false) {
+                // slot 3 of each dictionary entry is added into the count
+                foreach ($word_info as $entry) {
+                    $occurrences += $entry[3];
+                }
+            }
+            $counts[$word_key] = $occurrences;
+        }
+        return $counts;
+    }
+    /**
+     * Gets the description, count of summaries, and number of partitions of the
+     * summaries store in the supplied directory. If the file
+     * arc_description.txt exists, this is viewed as a dummy index archive for
+     * the sole purpose of allowing conversions of downloaded data such as arc
+     * files into Yioop! format.
+     *
+     * @param string $dir_name path to a directory containing a summaries
+     *      WebArchiveBundle
+     * @return array summary of the given archive
+     */
+    public static function getArchiveInfo($dir_name)
+    {
+        $arc_description_file = $dir_name . "/arc_description.txt";
+        if (file_exists($arc_description_file)) {
+            // dummy archive: synthesize an info struct with zeroed counts
+            $crawl = [
+                'DESCRIPTION' => substr(
+                    file_get_contents($arc_description_file), 0, 256),
+                'ARCFILE' => true
+            ];
+            return [
+                'VISITED_URLS_COUNT' => 0,
+                'COUNT' => 0,
+                'NUM_DOCS_PER_PARTITION' => 0,
+                'WRITE_PARTITION' => 0,
+                'DESCRIPTION' => serialize($crawl)
+            ];
+        }
+        if (file_exists($dir_name . "/description.txt")) {
+            // a description file directly in the folder is tried first
+            $info = WebArchiveBundle::getArchiveInfo($dir_name);
+            if (isset($info['DESCRIPTION'])) {
+                return $info;
+            }
+        }
+        return WebArchiveBundle::getArchiveInfo($dir_name . "/summaries");
+    }
+    /**
+     * Sets the archive info (DESCRIPTION, COUNT,
+     * NUM_DOCS_PER_PARTITION) for the web archive bundle associated with
+     * this bundle. As DESCRIPTION is used to store info about the info
+     * bundle this sets the global properties of the info bundle as well.
+     *
+     * @param string $dir_name folder with archive bundle
+     * @param array $info struct with above fields
+     */
+    public static function setArchiveInfo($dir_name, $info)
+    {
+        // the info is kept by the summaries bundle inside the folder
+        $summaries_dir = $dir_name . "/summaries";
+        WebArchiveBundle::setArchiveInfo($summaries_dir, $info);
+    }
+    /**
+     * Returns the last time the archive info of the bundle was modified.
+     *
+     * @param string $dir_name folder with archive bundle
+     */
+    public static function getParamModifiedTime($dir_name)
+    {
+        // delegate to the summaries bundle stored inside the folder
+        $summaries_dir = $dir_name . "/summaries";
+        return WebArchiveBundle::getParamModifiedTime($summaries_dir);
+    }
+}
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@ -0,0 +1,309 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * For crawlHash
+ */
+require_once __DIR__."/Utility.php";
+/**
+ * Class used to manage open IndexArchiveBundle's while performing
+ * a query. Ensures an easy place to obtain references to these bundles
+ * and ensures only one object per bundle is instantiated in a Singleton-esque
+ * way.
+ *
+ * @author Chris Pollett
+ */
+class IndexManager implements CrawlConstants
+{
+    /**
+     * Open IndexArchiveBundle's managed by this manager
+     * @var array
+     */
+    public static $indexes = [];
+    /**
+     * Used to cache word lookup of posting list locations for a given
+     * index
+     * @var array
+     */
+    public static $dictionary = [];
+    /**
+     * Returns a reference to the managed copy of an IndexArchiveBundle object
+     * with a given timestamp or an IndexShard in the case where
+     * $index_name == "feed" (for handling media feeds)
+     *
+     * @param string $index_name timestamp of desired IndexArchiveBundle
+     * @return object the desired IndexArchiveBundle reference
+     */
+    public static function getIndex($index_name)
+    {
+        $index_name = trim($index_name); //trim to fix postgres quirkiness
+        if (isset(self::$indexes[$index_name])) {
+            // already opened earlier, hand back the managed copy
+            return self::$indexes[$index_name];
+        }
+        if ($index_name == "feed") {
+            // media feeds live in a single shard rather than a bundle
+            $index_file = C\WORK_DIRECTORY . "/feeds/index";
+            if (!file_exists($index_file)) {
+                return false;
+            }
+            self::$indexes[$index_name] = new IndexShard(
+                $index_file, 0, C\NUM_DOCS_PER_GENERATION, true);
+            return self::$indexes[$index_name];
+        }
+        $index_archive_name = self::index_data_base_name . $index_name;
+        $bundle = new IndexArchiveBundle(
+            C\CRAWL_DIR . '/cache/' . $index_archive_name);
+        /* NOTE(review): new always yields an object, which is never falsy,
+           so this guard is not expected to trigger
+         */
+        if (!$bundle) {
+            return false;
+        }
+        // begin reading at shard 0 with the disk based flag turned on
+        $bundle->setCurrentShard(0, true);
+        self::$indexes[$index_name] = $bundle;
+        return $bundle;
+    }
+    /**
+     * Returns the version of the index, so that Yioop can determine
+     * how to do word lookup. The only major change to the format was
+     * when word_id's went from 8 to 20 bytes which happened around Unix
+     * time 1369754208.
+     *
+     * @param string $index_name unix timestamp of index
+     * @return int 0 - if the original format for Yioop indexes; 1 -if 20 byte
+     *     word_id format
+     */
+    public static function getVersion($index_name)
+    {
+        // indexes with a timestamp before the cutoff are version 0
+        if (intval($index_name) < C\VERSION_0_TIMESTAMP) {
+            return 0;
+        }
+        $index = self::getIndex($index_name);
+        // use the version the index carries, otherwise assume version 1
+        return isset($index->version) ? $index->version : 1;
+    }
+    /**
+     * Gets an array posting list positions for each shard in the
+     * bundle $index_name for the word id $hash
+     *
+     * @param string $index_name bundle to look $hash in
+     * @param string $hash hash of phrase or word to look up in bundle
+     *     dictionary
+     * @param int $shift if $hash is for a phrase, how many low order
+     *     bits of word id to discard
+     * @param string $mask if $hash is for a word, after the 9th byte what
+     *     meta word mask should be applied to the 20 byte hash
+     * @param int $threshold after the number of results exceeds this amount
+     *     stop looking for more dictionary entries.
+     * @param int $start_generation
+     * @param int $num_distinct_generations
+     * @param bool $with_remaining_total
+     * @return array sequence of four tuples:
+     *     (index_shard generation, posting_list_offset, length, exact id
+     *      that match $hash)
+     */
+    public static function getWordInfo($index_name, $hash, $shift = 0,
+        $mask = "", $threshold = -1, $start_generation = -1,
+        $num_distinct_generations = -1, $with_remaining_total = false)
+    {
+        /* Results are cached in IndexManager::$dictionary under the keys
+           [$id][$hash][$shift][$mask][$threshold] as a pair
+           [total, records]. Each record has the form
+           [generation, slot 1, slot 2, count, exact id]; the feed shard's
+           record is stored under key -1.
+         */
+        $id = "$index_name:$start_generation:$num_distinct_generations";
+        $index = IndexManager::getIndex($index_name);
+        if (!$index->dictionary) {
+            /* No bundle dictionary is available, so the feed shard is the
+               only place postings can come from
+             */
+            $tmp = [];
+            if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS)
+               && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
+               //NO_FEEDS defined true in statistic_controller.php
+                $feed_shard = IndexManager::getIndex("feed");
+                $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
+                    $mask);
+                if (is_array($feed_info)) {
+                    $tmp[-1] = [-1, $feed_info[0],
+                        $feed_info[1], $feed_info[2], $feed_info[3]];
+                }
+            }
+            if ($tmp == []) {
+                return ($with_remaining_total) ? [0, false] : false;
+            }
+            /* The total is the record's count (slot 3), the same slot that
+               is summed into the total further down; slot 4 is the exact
+               id and must not be used as a total.
+             */
+            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
+                [$tmp[-1][3], $tmp];
+            return ($with_remaining_total) ?
+                IndexManager::$dictionary[$id][$hash][$shift][$mask][
+                    $threshold] :
+                IndexManager::$dictionary[$id][$hash][$shift][$mask][
+                    $threshold][1];
+        }
+        $len = strlen($mask);
+        if ($len > 0) {
+            // with a mask, only the first 8 bytes of the hash select the word
+            $pre_hash = substr($hash, 0, 8) .
+                "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
+        } else {
+            $pre_hash = $hash;
+        }
+        if (!isset(IndexManager::$dictionary[$id][$hash][$shift][$mask][
+            $threshold])) {
+            $tmp = [];
+            if (isset(IndexManager::$dictionary[$id][$pre_hash][
+                $shift])) {
+                /* Try to answer from a cached lookup of the same word that
+                   used a mask no longer than, and compatible with, ours
+                 */
+                foreach (IndexManager::$dictionary[$id][$pre_hash][
+                    $shift] as $test_mask => $data) {
+                    $mask_len = strlen($test_mask);
+                    if ($mask_len > $len) {continue; }
+                    $mask_found = true;
+                    for ($k = 0; $k < $mask_len; $k++) {
+                        if (ord($test_mask[$k]) > 0 &&
+                            $test_mask[$k] != $mask[$k]) {
+                            $mask_found = false;
+                            break;
+                        }
+                    }
+                    if ($mask_found && isset(
+                        IndexManager::$dictionary[$id][$pre_hash][
+                            $shift][$test_mask][$threshold]) ) {
+                        list($total, $info) =
+                            IndexManager::$dictionary[$id][$pre_hash
+                            ][$shift][$test_mask][$threshold];
+                        $out_info = [];
+                        foreach ($info as $record) {
+                            $rid = $record[4];
+                            $add_flag = true;
+                            if ($mask != "") {
+                               // keep records whose id agrees on masked bytes
+                               for ($k = 0; $k < $len; $k++) {
+                                    $loc = 8 + $k;
+                                    if (ord($mask[$k]) > 0 &&
+                                        isset($rid[$loc]) &&
+                                        $rid[$loc] != $hash[$loc]) {
+                                        $add_flag = false;
+                                        break;
+                                    }
+                                }
+                            }
+                            if ($add_flag) {
+                                $out_info[$record[0]] = $record;
+                            } else {
+                                // discount the records that were filtered out
+                                if ($record[3] < $total) {
+                                    $total -= $record[3];
+                                }
+                            }
+                        }
+                        /* Cache under the $threshold key like every other
+                           read and write of this cache. Storing at the
+                           $mask level would wipe the entries held for other
+                           thresholds and leave a pair that later reads
+                           would index as if it were keyed by threshold.
+                         */
+                        IndexManager::$dictionary[$id][$hash][$shift
+                           ][$mask][$threshold] = [$total, $out_info];
+                        return ($with_remaining_total) ?
+                            IndexManager::$dictionary[$id][
+                            $hash][$shift][$mask][$threshold] :
+                            IndexManager::$dictionary[$id][
+                            $hash][$shift][$mask][$threshold][1];
+                    }
+                }
+            }
+            // feed results are only mixed in when no start generation given
+            if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) &&
+                $start_generation < 0
+                && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
+                //NO_FEEDS defined true in statistic_controller.php
+                $feed_shard = IndexManager::getIndex("feed");
+                $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
+                    $mask);
+                if (is_array($feed_info)) {
+                    $tmp[-1] = [-1, $feed_info[0],
+                        $feed_info[1], $feed_info[2], $feed_info[3]];
+                }
+            }
+            $pre_info =
+                $index->dictionary->getWordInfo($hash, true, $shift, $mask,
+                $threshold, $start_generation, $num_distinct_generations, true);
+            if (isset($pre_info[1])) {
+                list($total, $info) = $pre_info;
+            } else {
+                $total = 0;
+                $info = [];
+            }
+            if (isset($tmp[-1][3])) {
+                // the feed record goes first and its count joins the total
+                $total += $tmp[-1][3];
+                $info = $tmp + $info;
+            }
+            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
+                [$total, $info];
+        }
+        return ($with_remaining_total) ?
+            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold]:
+            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold][1];
+    }
+    /**
+     * Returns the number of document that a given term or phrase appears in
+     * in the given index
+     *
+     * @param string $term_or_phrase what to look up in the indexes dictionary
+     *     no  mask is used for this look up
+     * @param string $index_name index to look up term or phrase in
+     * @param int $threshold if set and positive then once threshold many
+     *     documents are found the search for more documents to add to the
+     *     total is stopped
+     * @param int $start_generation
+     * @param int $num_distinct_generations
+     * @return int number of documents
+     */
+    public static function numDocsTerm($term_or_phrase, $index_name,
+        $threshold = -1, $start_generation = -1,
+        $num_distinct_generations = C\NUM_DISTINCT_GENERATIONS)
+    {
+        $index = IndexManager::getIndex($index_name);
+        if (!$index->dictionary) {
+            return false;
+        }
+        $hashes = allCrawlHashPaths($term_or_phrase, [], [], true);
+        if (!is_array($hashes)) {
+            $hashes = [$hashes];
+        }
+        $total_num_docs = 0;
+        foreach ($hashes as $hash) {
+            /* an array entry supplies (hash, shift, mask); a bare hash is
+               looked up with no shift and an empty mask
+             */
+            if (is_array($hash)) {
+                $word_id = $hash[0];
+                $shift = $hash[1];
+                $mask = $hash[2];
+            } else {
+                $word_id = $hash;
+                $shift = 0;
+                $mask = "";
+            }
+            list($num_docs, ) = IndexManager::getWordInfo($index_name,
+                $word_id, $shift, $mask, $threshold, $start_generation,
+                $num_distinct_generations, true);
+            $total_num_docs += $num_docs;
+            // stop early once a positive threshold has been passed
+            if ($threshold > 0 && $total_num_docs > $threshold) {
+                break;
+            }
+        }
+        return $total_num_docs;
+    }
+}
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
--- a/src/library/JavascriptUnitTest.php
+++ b/src/library/JavascriptUnitTest.php
@ -0,0 +1,53 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Super class of all the test classes testing Javascript functions.
+ * Both fixture hooks defined here have empty bodies.
+ *
+ * @author Akash Patel
+ */
+class JavascriptUnitTest extends UnitTest
+{
+    /**
+     * Does nothing in this class (the body is empty).
+     * {@inheritDoc}
+     */
+    public function setUp()
+    {
+    }
+    /**
+     * Does nothing in this class (the body is empty).
+     * {@inheritDoc}
+     */
+    public function tearDown()
+    {
+    }
+}
+
--- a/src/library/Join.php
+++ b/src/library/Join.php
@ -0,0 +1,52 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Marker interface used to say that a class supports a join()
+ * callback method. IndexArchiveBundle has methods which take objects
+ * that implement Join. For activities which may take a long time,
+ * such as index saving or index tier merging, IndexArchiveBundle will
+ * periodically call the Join object's join method so that it can continue
+ * processing rather than blocking entirely until the long running method
+ * completes.
+ *
+ * @author Chris Pollett
+ * @see WebQueueBundle
+ */
+interface Join
+{
+    /**
+     * A callback function which will be invoked periodically by a method
+     * of another object that runs a long time. It takes no arguments.
+     */
+    public function join();
+}
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@ -0,0 +1,483 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * This file contains global functions connected to localization that
+ * are used throughout the web site part of Yioop!
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\models\LocaleModel;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Attempts to guess the user's locale based on the request, session,
+ * and user-agent data.
+ *
+ * The candidate tag is taken, in order of priority, from the request
+ * variable l, the session variable l, and the first entry of the browser's
+ * HTTP_ACCEPT_LANGUAGE header (after mapping a few common spellings to
+ * Yioop's tags). The candidate is only accepted if it is a short string made
+ * of letters, digits, hyphens and underscores and a folder for it exists
+ * under LOCALE_DIR; otherwise DEFAULT_LOCALE is returned.
+ *
+ * @return string IANA language tag of the guessed locale
+ */
+function guessLocale()
+{
+    /* the browser's HTTP_ACCEPT_LANGUAGE supplies the lowest priority
+       guess */
+    if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) &&
+        is_string($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
+        $l_parts = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']);
+        // drop any quality value (for example ";q=0.8") and white space
+        $first_parts = explode(";", $l_parts[0]);
+        $guess_l = trim($first_parts[0]);
+        $guess_map = [
+            "cn" => "zh-CN",
+            "en" => "en-US",
+            "en-us" => "en-US",
+            "en-US" => "en-US",
+            "fr" => "fr-FR",
+            "ko" => "ko",
+            "in" => "in-ID",
+            "ja" => "ja",
+            "vi" => "vi-VN",
+            "vi-vn" => "vi-VN",
+            "vi-VN" => "vi-VN",
+            "zh" => "zh-CN",
+            "zh-CN" => "zh-CN",
+            "zh-cn" => "zh-CN",
+        ];
+        if (isset($guess_map[$guess_l])) {
+            $guess_l = $guess_map[$guess_l];
+        }
+    }
+    if (isset($_REQUEST['l'])) {
+        $l = $_REQUEST['l'];
+    } else if (isset($_SESSION['l'])) {
+        $l = $_SESSION['l'];
+    } else if (isset($guess_l)) {
+        $l = $guess_l;
+    } else {
+        $l = null;
+    }
+    /* $l is untrusted: insist on a non-empty, short tag built only from
+       tag characters so that "", ".." or an array value can never reach
+       the file system check or be returned as a locale tag
+     */
+    if (is_string($l) && strlen($l) < 10 &&
+        preg_match('/^[A-Za-z0-9]+([_-][A-Za-z0-9]+)*$/', $l) === 1 &&
+        is_dir(C\LOCALE_DIR . "/" . str_replace("-", "_", $l))) {
+        return $l;
+    }
+    return C\DEFAULT_LOCALE;
+}
+/**
+ * Attempts to guess the user's locale based on a string sample.
+ *
+ * Punctuation (as given by C\PUNCT), digits and white space are removed,
+ * the remainder is converted to UTF-32 and each four byte character casts
+ * a weighted vote for a language based on its third and fourth bytes.
+ * If the resulting tag is en-US, checkQuery() is then applied to the
+ * original string and its result is returned instead.
+ *
+ * @param string $phrase_string used to make guess
+ * @param string $locale_tag language tag to use if can't guess -- if not
+ *     provided uses current locale's value
+ * @return string IANA language tag of the guessed locale (or the value
+ *     checkQuery() returns when the guess is en-US)
+ */
+function guessLocaleFromString($phrase_string, $locale_tag = null)
+{
+    $original_phrase_string = $phrase_string;
+    $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
+    $sub = C\PUNCT."|[0-9]|\s";
+    $phrase_string = preg_replace('/'.$sub.'/u', "", $phrase_string);
+    $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8");
+    $len = strlen($phrase_string);
+    $guess['zh-CN'] = 0;
+    $guess['ru'] = 0;
+    $guess['he'] = 0;
+    $guess['ar'] = 0;
+    $guess['th'] = 0;
+    $guess['ja'] = 0;
+    $guess['ko'] = 0;
+    // the fallback tag starts with one vote
+    $guess[$locale_tag] = 1;
+    /* step through the string four bytes (one UTF-32 character) at a time;
+       $start and $next are the third and fourth byte of the character.
+       NOTE(review): presumably the conversion is big-endian without a BOM,
+       so these are the high and low byte of a BMP code point -- confirm
+     */
+    for ($i = 0; $i < $len; $i += 4) {
+        $start = ord($phrase_string[$i+2]);
+        $next = ord($phrase_string[$i+3]);
+        if ($start >= 78 && $start <= 159) {
+            $guess['zh-CN'] += 4;
+        } else if ($start == 4 || ($start == 5 && $next < 48)) {
+            $guess['ru']++;
+        } else if ($start == 5 && $next >= 144) {
+            $guess['he'] += 2;
+        } else if ($start >= 6 && $start <= 7) {
+            // this byte range is credited to fa rather than ar when fa is
+            // the fallback tag
+            if ($locale_tag == "fa") {
+                $guess[$locale_tag] +=2;
+            } else {
+                $guess['ar'] += 2;
+            }
+        } else if ($start == 14 && $next < 128) {
+            $guess['th'] += 2;
+        } else if ($start >= 48 && $start <= 49) {
+            $guess['ja'] += 3;
+        } else if ($start == 17 || $start >= 172 && $start < 215) {
+            $guess['ko'] += 2;
+        } else if ($start == 0 && $next < 128) {
+            $guess[$locale_tag]++; // assume ascii is from $locale_tag
+        }
+    }
+    // NOTE(review): white space was stripped above, so it is unclear which
+    // lead and tail space the next comment refers to -- confirm
+    $num_points = ($len / 4) - 1; //there will be a lead and tail space
+    $max = $guess[$locale_tag];
+    if ($num_points >= 0 ) {
+        /* the first tag (in insertion order) whose votes reach $num_points
+           and exceed the fallback tag's votes wins; the break means later
+           tags are not compared even if they have more votes
+         */
+        foreach ($guess as $tag => $cnt) {
+            if ($cnt >= $num_points && $cnt > $max) {
+                $locale_tag = $tag;
+                $max = $cnt;
+                break;
+            }
+        }
+    }
+    if ($locale_tag == 'en-US') {
+        $locale_tag = checkQuery($original_phrase_string);
+    }
+    return $locale_tag;
+}
+/**
+ * Tries to find whether a query belongs to a programming language.
+ * After trimming, a query that begins with the control word java: or
+ * python: is mapped to java or py respectively; any other query is
+ * treated as en-US.
+ *
+ * @param string $query query entered by user
+ *
+ * @return string java, py, or en-US depending on how the query starts
+ */
+function checkQuery($query)
+{
+    $programming_language_map = ['java:' => 'java', 'python:' => 'py'];
+    // the control word has to be the very first thing in the trimmed query
+    if (preg_match("/^(java:|python:)/", trim($query), $matches) === 1 &&
+        isset($programming_language_map[$matches[1]])) {
+        return $programming_language_map[$matches[1]];
+    }
+    return 'en-US';
+}
+/**
+ * Tries to guess at a language tag based on the name of a character
+ * encoding. The comparison ignores letter case and surrounding white space,
+ * so upper-cased names such as those returned by guessEncodingHtml()
+ * (for example SHIFT_JIS or WINDOWS-1251) are recognized as well.
+ *
+ * @param string $encoding a character encoding name
+ *
+ * @return string guessed language tag; en if the encoding is not a string
+ *     or is not one of the known Japanese, Chinese, Korean or Russian
+ *     encodings
+ */
+function guessLangEncoding($encoding)
+{
+    // encoding names are stored upper-cased; order matches the lookup order
+    static $encodings_by_lang = [
+        "ja" => ["EUC-JP", "SHIFT_JIS", "JIS", "ISO-2022-JP"],
+        "zh-CN" => ["EUC-CN", "GBK", "GB2312", "EUC-TW", "HZ", "CP936",
+            "BIG-5", "CP950"],
+        "ko" => ["EUC-KR", "UHC", "CP949", "ISO-2022-KR"],
+        "ru" => ["WINDOWS-1251", "CP1251", "CP866", "IBM866", "KOI8-R"],
+    ];
+    if (!is_string($encoding)) {
+        return 'en';
+    }
+    $normalized = strtoupper(trim($encoding));
+    foreach ($encodings_by_lang as $lang => $names) {
+        if (in_array($normalized, $names, true)) {
+            return $lang;
+        }
+    }
+    return 'en';
+}
+/**
+ * Tries to guess the encoding used for an Html document
+ *
+ * @param string $html text of the Html document whose encoding is to be
+ *     guessed
+ * @param bool $return_loc_info if a charset=... declaration before the
+ *     close of the head was used to find the encoding, then if
+ *     $return_loc_info is true, we return the location of the charset
+ *     substring. This allows converting to UTF-8 later so cached pages
+ *     will display correctly and redirects without char encoding won't be
+ *     given a different hash.
+ *
+ * @return mixed either string or array if string then guessed encoding,
+ *     if array guessed encoding, start_pos of where charset info came from,
+ *     length. When no usable charset declaration is found the result of
+ *     mb_detect_encoding($html, 'auto') is returned
+ */
+function guessEncodingHtml($html, $return_loc_info = false)
+{
+     /*
+       If the doc is HTML and it uses a http-equiv to set the encoding
+       then we override what the server says (if anything). As we
+       are going to convert to UTF-8 we remove the charset info
+       from the meta tag so cached pages will display correctly and
+       redirects without char encoding won't be given a different hash.
+     */
+    $end_head = stripos($html, "</head");
+    if ($end_head) {
+        $reg = "/charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?/u";
+        $is_match = preg_match($reg, $html, $match);
+        if (!$is_match) {
+            /* preg_match found nothing (or failed), so retry the same
+               pattern with the multi-byte regex functions
+             */
+            $reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?";
+            mb_regex_encoding("UTF-8");
+            mb_ereg_search_init($html);
+            mb_ereg_search($reg);
+            $match = mb_ereg_search_getregs();
+            if (isset($match[0])) {
+                $is_match = true;
+            }
+        }
+        /* $match[3] is the optional opening quote, $match[4] the charset
+           name and $match[6] the optional closing quote.
+           NOTE(review): preg_match omits trailing groups that did not
+           participate in the match, so a charset value with no quote
+           after it appears to skip this branch -- confirm this is intended
+         */
+        if ($is_match && isset($match[6])) {
+            $len_c = strlen($match[0]);
+            // a quote after the name that has no matching quote before it
+            // is not counted as part of the charset substring
+            if (($match[6] == "'" || $match[6] == '"') &&
+               $match[3] != $match[6]) {
+                $len_c--;
+            }
+            $start_charset = strpos($html, $match[0]);
+            // only trust declarations that end before the close of the head
+            if ($start_charset + $len_c < $end_head) {
+                if (isset($match[4])) {
+                    $encoding = strtoupper($match[4]);
+                    if ($return_loc_info) {
+                        return [$encoding, $start_charset, $len_c];
+                    }
+                    return $encoding;
+                }
+            }
+        }
+    }
+    return mb_detect_encoding($html, 'auto');
+}
+
+/**
+ * Translates an identifier into the current locale.
+ * This function takes a variable number of arguments: the first is the
+ * identifier to translate, the remaining ones are handed, together with the
+ * identifier, to the current locale's translate method so they can be
+ * interpolated for the %s's in the translation.
+ *
+ * @param string string_identifier  identifier to be translated
+ * @param mixed additional_args  used for interpolation in translated string
+ * @return mixed translated string; the identifier itself if the translation
+ *     trims to something empty; false if no locale object is currently set
+ */
+function tl()
+{
+    $current_locale = LocaleModel::$current_locale;
+    if (is_object($current_locale)) {
+        $arguments = func_get_args();
+        $translated = $current_locale->translate($arguments);
+        // fall back to the identifier when nothing usable came back
+        return (trim($translated)) ? $translated : $arguments[0];
+    }
+    return false;
+}
+/**
+ * Makes a fresh LocaleModel, initialized for the given language tag, the
+ * current locale of Yioop
+ *
+ * @param string $locale_tag the tag of the language to use to determine
+ *     locale settings
+ */
+function setLocaleObject($locale_tag)
+{
+    $model_class = C\NS_MODELS . "LocaleModel";
+    $fresh_locale = new $model_class();
+    $fresh_locale->initialize($locale_tag);
+    // only publish the object once it has been initialized
+    LocaleModel::$current_locale = $fresh_locale;
+}
+/**
+ * Gets the language tag (for instance, en-US for American English) of the
+ * locale that is currently being used. If no locale has been set yet, one
+ * is guessed with guessLocale() and installed with setLocaleObject(), so
+ * this function can have the side effect of setting Yioop's current locale.
+ *
+ * @return string  the tag of the language currently being used for locale
+ *     settings
+ */
+function getLocaleTag()
+{
+    $current_locale = LocaleModel::$current_locale;
+    if ($current_locale) {
+        return $current_locale->getLocaleTag();
+    }
+    $guessed_tag = guessLocale();
+    setLocaleObject($guessed_tag);
+    return $guessed_tag;
+}
+/**
+ * Returns the text direction of the current locale.
+ *
+ * @return string ltr or rtl depending on if the language is left-to-right
+ * or right-to-left
+ */
+function getLocaleDirection()
+{
+    return LocaleModel::$current_locale->getLocaleDirection();
+}
+/**
+ * Returns the query statistics info for the current locale.
+ *
+ * @return array with field QUERY_LOG holding the query log of the current
+ *     locale's db object and field TOTAL_ELAPSED_TIME holding its total time
+ */
+function getLocaleQueryStatistics()
+{
+    $current_locale = LocaleModel::$current_locale;
+    return [
+        'QUERY_LOG' => $current_locale->db->query_log,
+        'TOTAL_ELAPSED_TIME' => $current_locale->db->total_time
+    ];
+}
+/**
+ * Returns the current locale's method of writing blocks (things like divs
+ * or paragraphs). A language like English puts blocks one after another
+ * from the top of the page to the bottom. Other languages like classical
+ * Chinese list them from right to left.
+ *
+ * @return string  tb lr rl depending on the current locales block progression
+ */
+function getBlockProgression()
+{
+    return LocaleModel::$current_locale->getBlockProgression();
+}
+/**
+ * Returns the writing mode of the current locale. This is a combination of
+ * the locale direction and the block progression. For instance, for English
+ * the writing mode is lr-tb (left-to-right top-to-bottom).
+ *
+ * @return string   the locales writing mode
+ */
+function getWritingMode()
+{
+    return LocaleModel::$current_locale->getWritingMode();
+}
+/**
+ * Converts a string encoded in Windows-1256 into UTF-8. Each byte of the
+ * input is looked up in a 256 entry table of code points and the code point
+ * is then written out using utf8chr().
+ *
+ * @param string $str Windows-1256 string to convert
+ * @return string the UTF-8 equivalent
+ */
+function w1256ToUTF8($str)
+{
+    // $conv[b] is the code point for the Windows-1256 byte value b
+    static $conv = [
+        0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
+        0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 0x0010, 0x0011,
+        0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A,
+        0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020, 0x0021, 0x0022, 0x0023,
+        0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C,
+        0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
+        0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E,
+        0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+        0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050,
+        0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
+        0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x0060, 0x0061, 0x0062,
+        0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B,
+        0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
+        0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D,
+        0x007E, 0x007F, 0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020,
+        0x2021, 0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688,
+        0x06AF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x06A9,
+        0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA, 0x00A0, 0x060C,
+        0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x06BE,
+        0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, 0x00B0, 0x00B1, 0x00B2, 0x00B3,
+        0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC,
+        0x00BD, 0x00BE, 0x061F, 0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625,
+        0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E,
+        0x062F, 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7,
+        0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643, 0x00E0,
+        0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7, 0x00E8, 0x00E9,
+        0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF, 0x064B, 0x064C, 0x064D,
+        0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7, 0x0651, 0x00F9, 0x0652, 0x00FB,
+        0x00FC, 0x200E, 0x200F, 0x06D2
+    ];
+    $num_bytes = strlen($str);
+    $pieces = [];
+    $index = 0;
+    while ($index < $num_bytes) {
+        $byte_value = ord($str[$index]);
+        $pieces[] = utf8chr($conv[$byte_value]);
+        $index++;
+    }
+    return implode("", $pieces);
+}
+/**
+ * Given a unicode codepoint convert it to UTF-8
+ *
+ * @param int $code  the codepoint to convert
+ * @return string the corresponding UTF-8 string of one to four bytes; the
+ *     empty string if $code is larger than 0x1FFFFF
+ */
+function utf8chr($code)
+{
+    if ($code <= 0x7F) {
+        // single byte: the code point is its own encoding
+        return chr($code);
+    }
+    if ($code <= 0x7FF) {
+        $byte_values = [($code >> 6) + 192, ($code & 63) + 128];
+    } else if ($code <= 0xFFFF) {
+        $byte_values = [($code >> 12) + 224, (($code >> 6) & 63) + 128,
+            ($code & 63) + 128];
+    } else if ($code <= 0x1FFFFF) {
+        $byte_values = [($code >> 18) + 240, (($code >> 12) & 63) + 128,
+            (($code >> 6) & 63) + 128, ($code & 63) + 128];
+    } else {
+        return '';
+    }
+    // lead byte followed by continuation bytes carrying six bits each
+    return implode("", array_map("chr", $byte_values));
+}
+/**
+ * Function for formatting a date string based on the locale.
+ * For the locale tags listed in the table below the process locale is
+ * switched with setlocale(LC_ALL, ...) and the date is formatted with
+ * strftime; for any other tag date() is used and the locale is untouched.
+ *
+ * @param $timestamp is the crawl time
+ * @param $locale_tag is the tag for locale
+ * @return string formatted date string
+ */
+function formatDateByLocale($timestamp, $locale_tag)
+{
+    // Yioop locale tag => locale name handed to setlocale
+    $system_locale_names = [
+        'de' => 'deu',
+        'en-US' => 'enu',
+        'es' => 'esp',
+        'fr-FR' => 'fra',
+        'it' => 'ita',
+        'ja' => 'jpn',
+        'ko' => 'kor',
+        'pl' => 'plk',
+        'ru' => 'rus',
+        'tr' => 'trk',
+    ];
+    if (is_string($locale_tag) && isset($system_locale_names[$locale_tag])) {
+        setlocale(LC_ALL, $system_locale_names[$locale_tag]);
+        return strftime("%B %d %Y %H:%M", $timestamp);
+    }
+    return date("F d Y H:i", intval($timestamp));
+}
--- a/src/library/MailServer.php
+++ b/src/library/MailServer.php
@ -0,0 +1,390 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\AnalyticsManager;
+use seekquarry\yioop\library\MediaConstants;
+
+/**
+ * Timing functions
+ */
+require_once __DIR__."/Utility.php";
+/**
+ * A small class for communicating with an SMTP server. Used to avoid
+ * configuration issues that might be needed with PHP's built-in mail()
+ * function. Here is an example of how one might use this class:
+ *
+ * $server = new MailServer('someone@somewhere.com', 'somewhere.com', 587,
+ *     'someone', 'pword', 'tls');
+ * $to = "cool@place.com";
+ * $from = "someone@somewhere.com";
+ * $subject = "Test Mail";
+ * $message = "This is a test";
+ * $server->send($subject, $from, $to, $message);
+ *
+ * @author Chris Pollett
+ */
+class MailServer implements MediaConstants
+{
+    /**
+     * Email address of default mail sender
+     * @var string
+     */
+    public $sender_email;
+    /**
+     * Hostname of default mail sender
+     * @var string
+     */
+    public $sender_host;
+    /**
+     * Domain name of the SMTP server (prefixed with ssl:// when the
+     * connection is to be made over SSL)
+     * @var string
+     */
+    public $server;
+    /**
+     * Port number the mail server is running on
+     * @var int
+     */
+    public $port;
+    /**
+     * If auth is used, the username to log into the SMTP server with
+     * @var string
+     */
+    public $login;
+    /**
+     * If auth is used, the password to log into the SMTP server with
+     * @var string
+     */
+    public $password;
+    /**
+     * Either false if no security/auth used or ssl or tls
+     * @var mixed
+     */
+    public $secure;
+    /**
+     * Socket of the currently open SMTP session, null if there is none
+     * @var resource
+     */
+    public $connection;
+    /**
+     * Transcript of the commands sent and responses received, together
+     * with any error notes, for the most recent mail activity
+     * @var string
+     */
+    public $messages;
+    /**
+     * End of line string for an SMTP server
+     */
+    const EOL = "\r\n";
+    /**
+     * How long before timeout when making a connection to an SMTP server
+     */
+    const SMTP_TIMEOUT = 10;
+    /**
+     * Length of an SMTP response code
+     */
+    const SMTP_CODE_LEN = 3;
+    /**
+     * Service ready for requests
+     */
+    const SERVER_READY = 220;
+    /**
+     * SMTP last action okay
+     */
+    const OKAY = 250;
+    /**
+     * authentication successful
+     */
+    const GO_AHEAD = 235;
+    /**
+     * Send next authentication item
+     */
+    const CONT_REQ = 334;
+    /**
+     * Ready for the actual mail input
+     */
+    const START_INPUT = 354;
+    /**
+     * Encapsulates the domain and credentials of a SMTP server
+     * in a MailServer object
+     *
+     * @param string $sender_email who mail will be sent from (can be
+     *     overwritten)
+     * @param string $server domain name of machine will connect to
+     * @param int $port port on that machine
+     * @param string $login username to use for authentication ("" if no
+     *     auth)
+     * @param string $password password to use for authentication ("" if no
+     *     auth)
+     * @param mixed $secure false is SSL and TLS not used, otherwise SSL or TLS
+     */
+    public function __construct($sender_email, $server, $port, $login,
+        $password, $secure = false)
+    {
+        $this->sender_email = $sender_email;
+        $mail_parts = explode("@", $this->sender_email);
+        $this->sender_host = (isset($mail_parts[1])) ? $mail_parts[1] :
+            "dev.null";
+        $this->server = $server;
+        /* for ssl the transport is selected through the host name given to
+           fsockopen, so the prefix has to be stored with the server
+         */
+        if (is_string($secure) && strtolower($secure) == "ssl") {
+            $this->server = 'ssl://' . $server;
+        }
+        $this->port = $port;
+        $this->login = $login;
+        $this->password = $password;
+        $this->secure = $secure;
+        $this->connection = null;
+        $this->messages = "";
+    }
+    /**
+     * Connects to and if needs be authenticates with a SMTP server.
+     * If any step after the connection was opened fails, the connection
+     * is closed again before returning.
+     *
+     * @return bool whether the session was successfully established
+     */
+    public function startSession()
+    {
+        $this->connection = fsockopen($this->server, $this->port, $errno,
+            $errstr, self::SMTP_TIMEOUT);
+        if (!$this->connection) {
+            $this->connection = null;
+            $this->messages .= "Could not connect to smtp server\n";
+            return false;
+        }
+        /* response codes are three character strings, the constants are
+           ints, so the comparisons below are deliberately loose
+         */
+        if ($this->readResponseGetCode() != self::SERVER_READY) {
+            return $this->abortSession("SMTP error\n");
+        }
+        $hostname = $this->sender_host;
+        $this->smtpCommand("HELO $hostname");
+        if (is_string($this->secure) && strtolower($this->secure) == 'tls') {
+            if ($this->smtpCommand('STARTTLS') != self::SERVER_READY) {
+                return $this->abortSession("Cannot start TLS\n");
+            }
+            if (stream_socket_enable_crypto($this->connection, true,
+                STREAM_CRYPTO_METHOD_TLS_CLIENT) !== true) {
+                // never continue in the clear if the upgrade did not happen
+                return $this->abortSession("Cannot start TLS\n");
+            }
+            if ($this->smtpCommand("HELO $hostname") != self::OKAY) {
+                return $this->abortSession("TLS HELO error\n");
+            }
+        }
+        if ($this->login != "" && $this->password != "") {
+            if ($this->smtpCommand('AUTH LOGIN') != self::CONT_REQ) {
+                return $this->abortSession(
+                    "Authentication Error Auth Login\n");
+            }
+            if ($this->smtpCommand(base64_encode($this->login))
+                != self::CONT_REQ) {
+                return $this->abortSession(
+                    "Authentication Error Username Transition\n");
+            }
+            if ($this->smtpCommand(base64_encode($this->password)) !=
+                self::GO_AHEAD) {
+                return $this->abortSession(
+                    "Authentication Error Password Transition\n");
+            }
+        }
+        return true;
+    }
+    /**
+     * Records an error note, closes the connection of a session that
+     * could not be established and reports failure
+     *
+     * @param string $note text to append to the message transcript
+     * @return bool always false
+     */
+    private function abortSession($note)
+    {
+        $this->messages .= $note;
+        if (is_resource($this->connection)) {
+            fclose($this->connection);
+        }
+        $this->connection = null;
+        return false;
+    }
+    /**
+     * Closes the currently active SMTP session (does nothing if there
+     * is no open connection)
+     */
+    public function endSession()
+    {
+        if (!is_resource($this->connection)) {
+            $this->connection = null;
+            return;
+        }
+        $this->smtpCommand('QUIT');
+        fclose($this->connection);
+        $this->connection = null;
+    }
+    /**
+     * Reads data from an SMTP server until a command response code detected.
+     * Lines are read until one has a space right after its response code
+     * (or nothing more can be read); everything read is appended to the
+     * message transcript.
+     *
+     * @return string three byte response code
+     */
+    public function readResponseGetCode()
+    {
+        $data = "";
+        while ($line = fgets($this->connection)) {
+            $data .= $line;
+            // lines shorter than the code length cannot end a response
+            if (isset($line[self::SMTP_CODE_LEN]) &&
+                $line[self::SMTP_CODE_LEN] == ' ') {
+                break;
+            }
+        }
+        $this->messages .= $data;
+        return substr($data, 0, self::SMTP_CODE_LEN);
+    }
+    /**
+     * Sends a single SMTP command to the current SMTP server and
+     * then returns the SMTP response code
+     *
+     * @param string $command the command to execute
+     * @return string three character integer response code
+     */
+    public function smtpCommand($command)
+    {
+        $this->messages .= htmlentities($command)."\n";
+        fputs($this->connection, $command . self::EOL);
+        return $this->readResponseGetCode();
+    }
+    /**
+     * Sends (or queues for media updater) an email
+     * (much like PHP's mail command, but not requiring
+     * a configured smtp server on the current machine).
+     * If QUERY_STATISTICS is on, the message transcript and elapsed
+     * time are also recorded with the AnalyticsManager.
+     *
+     * @param string $subject subject line of the email
+     * @param string $from sender email address ("" means use the default
+     *     sender of this MailServer)
+     * @param string $to recipient email address
+     * @param string $message message body for the email
+     */
+    public function send($subject, $from, $to, $message)
+    {
+        $start_time = microtime(true);
+        if ($from == "") {
+            $from = $this->sender_email;
+        }
+        if (C\SEND_MAIL_MEDIA_UPDATER == "true") {
+            $this->sendQueue($subject, $from, $to, $message);
+        } else {
+            $this->sendImmediate($subject, $from, $to, $message);
+        }
+        if (C\QUERY_STATISTICS) {
+            $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
+            if (!$current_messages) {
+                $current_messages = [];
+            }
+            $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
+            if (!$total_time) {
+                $total_time = 0;
+            }
+            $elapsed_time = changeInMicrotime($start_time);
+            $total_time += $elapsed_time;
+            $current_messages[] = [
+                "QUERY" => "<p>Send Mail</p>".
+                    "<pre>" . wordwrap($this->messages, 60, "\n", true) .
+                    "</pre>",
+                "ELAPSED_TIME" => $elapsed_time
+            ];
+            AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
+            AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
+        }
+    }
+    /**
+     * Sends immediately an email (as opposed to queueing for a future process
+     * to send). Carriage returns and line feeds are removed from the
+     * subject and the addresses so they cannot be used to add headers or
+     * SMTP commands, and body lines starting with a period are dot-stuffed
+     * so they cannot end the DATA section early.
+     *
+     * @param string $subject subject line of the email
+     * @param string $from sender email address
+     * @param string $to recipient email address
+     * @param string $message message body for the email
+     */
+    public function sendImmediate($subject, $from, $to, $message)
+    {
+        $eol = self::EOL;
+        $line_breaks = ["\r", "\n"];
+        $subject = str_replace($line_breaks, "", $subject);
+        $from = str_replace($line_breaks, "", $from);
+        $to = str_replace($line_breaks, "", $to);
+        if (C\USE_MAIL_PHP) {
+            $header = "From: " . $from . $eol;
+            mail($to, $subject, $message, $header);
+            return;
+        }
+        $this->messages = "";
+        /* a line consisting of just a period ends the DATA section, so
+           any body line that begins with a period gets a second one
+         */
+        $body = preg_replace('/^\./m', '..', $message);
+        if ($body === null) {
+            $body = $message;
+        }
+        $mail  = "Date: " . date(DATE_RFC2822) . $eol;
+        $mail .= "Subject: " . $subject . $eol;
+        $mail .= "From: " . $from . $eol;
+        $mail .= "To: ". $to . $eol;
+        $mail .= $eol . $eol . $body . $eol . ".";
+        // each entry is a command together with the response it should get
+        $commands = [
+            ["MAIL FROM: <$from>", self::OKAY],
+            ["RCPT TO: <$to>", self::OKAY],
+            ["DATA", self::START_INPUT],
+            [$mail, self::OKAY]
+        ];
+        if ($this->startSession()) {
+            foreach ($commands as $command_info) {
+                list($command, $good_response) = $command_info;
+                $response = $this->smtpCommand($command);
+                if ($response != $good_response) {
+                    $this->messages .=
+                        "$command failed!! $response $good_response\n";
+                    break;
+                }
+            }
+            $this->endSession();
+        }
+    }
+
+    /**
+     * Sends an email to the media updater mail queue. The serialized mail
+     * is appended to the most recent timestamp-named .txt file of the mail
+     * folder, or to a new such file if there is none yet or the most recent
+     * one is older than MAX_MAIL_TIMESTAMP_LIMIT seconds.
+     *
+     * @param string $subject subject line of the email
+     * @param string $from sender email address
+     * @param string $to recipient email address
+     * @param string $message message body for the email
+     */
+    public function sendQueue($subject, $from, $to, $message)
+    {
+        $mail_directory = C\WORK_DIRECTORY . self::MAIL_FOLDER;
+        if (!file_exists($mail_directory)) {
+            mkdir($mail_directory);
+            setWorldPermissions($mail_directory);
+            if (!file_exists($mail_directory)) {
+                crawlLog("Could not create mail directory!");
+                exit();
+            }
+        }
+        $files = glob($mail_directory."/*.txt");
+        if (!is_array($files)) {
+            // glob reports errors with false
+            $files = [];
+        }
+        $file_count = count($files);
+        $last_file_time = "";
+        $file = "";
+        $diff = 0;
+        if ($file_count > 0) {
+            $file = end($files);
+            // file names are the timestamp of their creation plus .txt
+            $last_file_time = basename($file, ".txt");
+            $diff = time() - intval($last_file_time);
+        }
+        $mail_details = serialize([$subject, $from, $to, $message]);
+        $this->messages = "Queuing: " . $mail_details;
+        if ($diff > C\MAX_MAIL_TIMESTAMP_LIMIT || $file_count == 0) {
+            crawlLog("...Creating a new file for next mailer batch!\n");
+            $file_time = time();
+            $file_path = $mail_directory . "/" . $file_time . ".txt";
+            // only the first record of a file carries the separator
+            $record = self::MESSAGE_SEPARATOR . $mail_details;
+            $locked_note = "....Lock for mail file acquired!" .
+                " Sending emails!\n";
+            $failed_note = "Could not acquire the lock " .
+                " for $file_time.txt!\n";
+        } else {
+            $file_path = $mail_directory . "/" . $last_file_time . ".txt";
+            $record = $mail_details;
+            $locked_note = "....Lock acquired! Sending emails now!\n";
+            $failed_note = "Could not acquire the lock! for $file!\n";
+        }
+        $fp = fopen($file_path, "a+");
+        if (!$fp) {
+            crawlLog("Could not open mail file $file_path!\n");
+            return;
+        }
+        if (flock($fp, LOCK_EX | LOCK_NB)) {
+            crawlLog($locked_note);
+            fwrite($fp, $record . PHP_EOL);
+            flock($fp, LOCK_UN);
+            setWorldPermissions($file_path);
+        } else {
+            crawlLog($failed_note);
+        }
+        // release the handle whether or not the lock was obtained
+        fclose($fp);
+    }
+}
--- a/src/library/MediaConstants.php
+++ b/src/library/MediaConstants.php
@ -0,0 +1,77 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Shared constants and enums used by components that are involved in the
+ * media related operations
+ *
+ * All of the values are strings beginning with "/" naming folders or files
+ * (except MESSAGE_SEPARATOR, which is a marker string).
+ * NOTE(review): the folder/file names are presumably appended to some base
+ * directory by the code that uses them -- confirm against the callers.
+ *
+ * @author Chris Pollett
+ */
+interface MediaConstants
+{
+    /**
+     * Folder in which video files that still need to be converted
+     * are placed.
+     */
+    const CONVERT_FOLDER = "/schedules/media_convert";
+    /**
+     * Folder in which video files are placed once they have been
+     * converted.
+     */
+    const CONVERTED_FOLDER = "/schedules/media_converted";
+    /**
+     * Name of the text file used to recognize that a video file is
+     * about to be split.
+     */
+    const SPLIT_FILE = "/split.txt";
+    /**
+     * Name of the text file used to store the info of a video file.
+     */
+    const FILE_INFO = "/file_info.txt";
+    /**
+     * Name of the text file used to store the count of split files
+     * generated from a video file.
+     */
+    const COUNT_FILE = "/count.txt";
+    /**
+     * Name of the text file used to store the list of split file
+     * names so that they can be concatenated.
+     */
+    const ASSEMBLE_FILE = "/ready_to_assemble.txt";
+    /**
+     * Folder used to place text files (mailer lists) of messages that
+     * are to be sent in batches.
+     */
+    const MAIL_FOLDER = "/schedules/mail";
+    /**
+     * Magic string used to separate mail messages
+     */
+    const MESSAGE_SEPARATOR = "+-7b6Ze3ef#a";
+}
--- a/src/library/NWordGrams.php
+++ b/src/library/NWordGrams.php
@ -0,0 +1,341 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Library of functions used to create and extract n word grams
+ *
+ * @author Ravi Dhillon (Bigram Version), Chris Pollett (ngrams + rewrite +
+ * support for page count dumps)
+ */
+class NWordGrams
+{
+    /**
+     * Static copy of n-grams files
+     * @var object
+     */
+    protected static $ngrams = null;
+    /**
+     * How many bytes to read in one go from wiki file when creating filter
+     */
+    const BLOCK_SIZE = 8192;
+    /**
+     * Suffix appended to language tag to create the
+     * filter file name containing bigrams.
+     */
+    const FILTER_SUFFIX = "_word_grams.ftr";
+    /**
+     * Suffix appended to language tag to create the
+     * text file name containing bigrams.
+     */
+    const TEXT_SUFFIX = "_word_grams.txt";
+    const WIKI_DUMP_REDIRECT = 0;
+    const WIKI_DUMP_TITLE = 1;
+    const PAGE_COUNT_WIKIPEDIA = 2;
+    const PAGE_COUNT_WIKTIONARY = 3;
+    /**
+     * Says whether or not phrase exists in the N word gram Bloom Filter
+     * of the given language.
+     *
+     * Filters are loaded from disk on first use and then kept in the static
+     * $ngrams cache. The cache is keyed by both language and filter prefix
+     * so that asking about a second language does not get answered out of
+     * the filter that was loaded for the first one.
+     *
+     * @param string $phrase what to check if is an n word gram (it is
+     *     lower cased before being looked up)
+     * @param string $lang language of n word grams file ("-" in the tag is
+     *     treated as "_")
+     * @param string $filter_prefix either the word "segment", "all", or
+     *     number n of the number of words in an ngram in filter.
+     * @return bool true if the filter reports the phrase as present; false
+     *     if it does not, or if no usable filter file exists for the
+     *     language and prefix
+     */
+    public static function ngramsContains($phrase, $lang, $filter_prefix = 2)
+    {
+        $lang = str_replace("-", "_", $lang);
+        if (!isset(self::$ngrams[$lang][$filter_prefix])) {
+            $filter_path = C\LOCALE_DIR . "/$lang/resources/" .
+                "{$filter_prefix}" . self::FILTER_SUFFIX;
+            if (!file_exists($filter_path)) {
+                return false;
+            }
+            $filter = BloomFilterFile::load($filter_path);
+            if (!is_object($filter)) {
+                // unreadable or corrupt filter file: treat as "no filter"
+                return false;
+            }
+            self::$ngrams[$lang][$filter_prefix] = $filter;
+        }
+        return self::$ngrams[$lang][$filter_prefix]->contains(
+            mb_strtolower($phrase));
+    }
+    /**
+     * Creates a bloom filter file from a n word gram text file. The
+     * path of n word gram text file used is based on the input $lang.
+     * The name of output filter file is based on the $lang and the
+     * number n. Size is based on input number of n word grams .
+     * The n word grams are read from text file one per line, passed through
+     * PhraseParser::stemTerms for $lang, lower cased, and then stored in the
+     * filter file. Blank lines are skipped, as are n grams whose first
+     * stemmed term is a single byte long.
+     *
+     * The text file is opened before any existing filter is deleted, so a
+     * missing text file does not destroy a previously built filter.
+     *
+     * @param string $lang locale to be used to stem n grams.
+     * @param string $num_gram value of n in n-gram (how many words in sequence
+     *      should constitute a gram)
+     * @param int $num_ngrams_found count of n word grams in text file.
+     * @param int $max_gram_len value n of longest n gram to be added.
+     * @return none (the script dies if the text file cannot be opened)
+     */
+    public static function makeNWordGramsFilterFile($lang, $num_gram,
+        $num_ngrams_found, $max_gram_len = 2)
+    {
+        $lang = str_replace("-", "_", $lang);
+        $resource_prefix = C\LOCALE_DIR . "/$lang/resources/" . "{$num_gram}";
+        $filter_path = $resource_prefix . self::FILTER_SUFFIX;
+        $input_file_path = $resource_prefix . self::TEXT_SUFFIX;
+        $fp = fopen($input_file_path, 'r') or
+            die("Can't open ngrams text file");
+        if (file_exists($filter_path)) {
+            unlink($filter_path); //build again from scratch
+        }
+        $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found);
+        while (($ngram = fgets($fp)) !== false) {
+            $ngram = trim($ngram);
+            if ($ngram === "") {
+                continue; // nothing to stem, and "" should not be in filter
+            }
+            $words = PhraseParser::stemTerms($ngram, $lang);
+            if (empty($words)) {
+                continue;
+            }
+            // byte length on purpose: only drops grams like "a dog", not
+            // grams starting with a single multi-byte character
+            if (isset($words[0]) && strlen($words[0]) == 1) {
+                continue;
+            }
+            $ngram_stemmed = implode(" ", $words);
+            $ngrams->add(mb_strtolower($ngram_stemmed));
+        }
+        fclose($fp);
+        $ngrams->max_gram_len = $max_gram_len;
+        $ngrams->save();
+    }
+    /**
+     * Used to create a filter file suitable for use in word segmentation
+     * (splitting text like "thiscontainsnospaces" into
+     * "this contains no spaces"). Used by @see token_tool.php
+     *
+     * For each non-blank line (word) of the dictionary, the lower cased
+     * word is added to the filter together with "*" followed by each of
+     * its proper suffixes (for "this": "*his", "*is", "*s").
+     *
+     * The dictionary is read before any existing filter is deleted, so an
+     * unreadable dictionary does not destroy a previously built filter.
+     *
+     * @param string $dict_file file to use as a dictionary to make filter from
+     * @param string $lang locale tag of locale we are building the filter for
+     */
+    public static function makeSegmentFilterFile($dict_file, $lang)
+    {
+        $lang = str_replace("-", "_", $lang);
+        $filter_path = C\LOCALE_DIR . "/$lang/resources/" .
+            "segment" . self::FILTER_SUFFIX;
+        $words = file($dict_file);
+        if ($words === false) {
+            die("Can't open dictionary file");
+        }
+        if (file_exists($filter_path)) {
+            unlink($filter_path); //build again from scratch
+        }
+        $filter = new BloomFilterFile($filter_path, count($words));
+        foreach ($words as $word) {
+            $word = trim($word);
+            if ($word === "") {
+                continue;
+            }
+            // same encoding for length and substring so that $i never runs
+            // past the end of the word
+            $len = mb_strlen($word, "UTF-8");
+            $filter->add(mb_strtolower($word));
+            for ($i = 1; $i < $len; $i++) {
+                $suffix = mb_substr($word, $i, $len, "UTF-8");
+                if ($suffix === "") {
+                    continue;
+                }
+                $filter->add(mb_strtolower("*" . $suffix));
+            }
+        }
+        $filter->save();
+    }
+    /**
+     * Generates a n word grams text file from input wikipedia xml file
+     * or page count dump.
+     * The input file can be bz2 or gz compressed or uncompressed.
+     * The input is read in blocks of BLOCK_SIZE bytes, split into lines,
+     * and each lower cased line is searched for the n word gram pattern of
+     * the requested type. For the page count types the grams are ordered by
+     * their count (highest first) before any $max_terms cut-off is applied.
+     * After all files are parsed duplicate n word grams are removed, the
+     * result is sorted and written one gram per line to the text file
+     * {$num_gram}_word_grams.txt in the resources folder of $locale.
+     * When $num_gram is "all", for each gram of three or more words every
+     * proper prefix of at least two words is also written with a trailing
+     * "*".
+     *
+     * @param string $wiki_file compressed or uncompressed wikipedia
+     *     XML file path (relative to PREP_DIR) to be used to extract
+     *     n word grams. This can also be a folder containing .gz or .bz2
+     *     files
+     * @param string $lang Language to be used to create n grams (it is the
+     *     project prefix looked for at the start of page count lines).
+     * @param string $locale Locale to be used to store results.
+     * @param int $num_gram number of words in grams we are looking for,
+     *     or the string "all"
+     * @param int $ngram_type where in Wiki Dump to extract grams from (one
+     *     of the WIKI_DUMP_* or PAGE_COUNT_* constants of this class)
+     * @param int $max_terms maximum number of n-grams to compute and put in
+     *      file (not counting "*" prefixes); values <= 0 mean no limit
+     * @return array pair [$num_ngrams_found, $max_gram_len]: the number of
+     *     grams written to the text file and the length in words of the
+     *     longest gram.
+     *     NOTE(review): for the page count types with a numeric $num_gram
+     *     the second value is returned as -1 (unchanged from the previous
+     *     implementation) -- confirm whether callers expect $num_gram here.
+     */
+    public static function makeNWordGramsTextFile($wiki_file, $lang,
+        $locale, $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA,
+        $max_terms = -1)
+    {
+        $output_message_threshold = self::BLOCK_SIZE * self::BLOCK_SIZE;
+        $is_count_type = false;
+        $replace_array = [];
+        // lines are lower cased before matching and $lang is spliced into a
+        // regex, so lower case it and escape any regex metacharacters
+        $quoted_lang = preg_quote(mb_strtolower($lang), '/');
+        switch ($ngram_type) {
+            case self::WIKI_DUMP_TITLE:
+                $pattern = '/<title>[^\p{P}]+';
+                $pattern_end = '<\/title>/u';
+                $replace_array = ['<title>','</title>'];
+                break;
+            case self::WIKI_DUMP_REDIRECT:
+                $pattern = '/#redirect\s\[\[[^\p{P}]+';
+                $pattern_end = '\]\]/u';
+                $replace_array = ['#redirect [[',']]'];
+                break;
+            case self::PAGE_COUNT_WIKIPEDIA:
+                $pattern = '/^' . $quoted_lang . '\s[^\p{P}]+';
+                $pattern_end = '/u';
+                $is_count_type = true;
+                break;
+            case self::PAGE_COUNT_WIKTIONARY:
+                $pattern = '/^' . $quoted_lang . '\.d\s[^\p{P}]+';
+                $pattern_end = '/u';
+                $is_count_type = true;
+                break;
+            default:
+                die("Unknown ngram type");
+        }
+        $is_all = ($num_gram === "all");
+        $repeat_pattern = "[\s|_][^\p{P}]+";
+        if ($is_all || $is_count_type) {
+            $pattern .= "($repeat_pattern)+";
+            $max_gram_len = -1;
+        } else {
+            for ($i = 1; $i < $num_gram; $i++) {
+                $pattern .= $repeat_pattern;
+            }
+            $max_gram_len = $num_gram;
+        }
+        $pattern .= $pattern_end;
+        $source_path = C\PREP_DIR . "/$wiki_file";
+        if (is_dir($source_path)) {
+            $folder_files = glob("$source_path/*.{gz,bz2}", GLOB_BRACE);
+            if ($folder_files === false) {
+                $folder_files = [];
+            }
+        } else {
+            $folder_files = [$source_path];
+        }
+        $ngrams_file_path
+            = C\LOCALE_DIR . "/$locale/resources/" . "{$num_gram}" .
+                self::TEXT_SUFFIX;
+        $ngrams = [];
+        /*
+            Extracts the n word gram (if any) from a single line and records
+            it in $ngrams. Lower casing is done here, on complete lines,
+            rather than on raw blocks so that a multi-byte character split
+            across two blocks is not mangled.
+         */
+        $collect = function ($line) use (&$ngrams, &$max_gram_len, $pattern,
+            $is_count_type, $is_all, $num_gram, $replace_array, $locale) {
+            $line = mb_strtolower($line);
+            if (!preg_match($pattern, $line, $matches)) {
+                return;
+            }
+            if ($is_count_type) {
+                // page count lines look like: project title count ...
+                $line_parts = explode(" ", $matches[0]);
+                if (!isset($line_parts[1]) || !isset($line_parts[2])) {
+                    return;
+                }
+                $ngram = str_replace("_", " ", $line_parts[1]);
+                $char_grams =
+                    PhraseParser::getCharGramsTerm([$ngram], $locale);
+                $ngram = implode(" ", $char_grams);
+                $ngram_num_words = mb_substr_count($ngram, " ") + 1;
+                if (($is_all && $ngram_num_words > 1) ||
+                    (!$is_all && $ngram_num_words == $num_gram)) {
+                    $ngrams[$ngram] = $line_parts[2];
+                }
+            } else {
+                // strip the literal tag/link markers around the gram
+                $ngram = str_replace($replace_array, "", $matches[0]);
+                $ngram = str_replace("_", " ", $ngram);
+                $ngrams[] = $ngram;
+                $ngram_num_words = mb_substr_count($ngram, " ") + 1;
+            }
+            if ($is_all) {
+                $max_gram_len = max($max_gram_len, $ngram_num_words);
+            }
+        };
+        foreach ($folder_files as $wiki_file_path) {
+            if (strpos($wiki_file_path, "bz2") !== false) {
+                $fr = bzopen($wiki_file_path, 'r') or
+                    die ("Can't open compressed file");
+                $read = "bzread";
+                $close = "bzclose";
+            } else if (strpos($wiki_file_path, "gz") !== false) {
+                $fr = gzopen($wiki_file_path, 'r') or
+                    die ("Can't open compressed file");
+                $read = "gzread";
+                $close = "gzclose";
+            } else {
+                $fr = fopen($wiki_file_path, 'r') or die("Can't open file");
+                $read = "fread";
+                $close = "fclose";
+            }
+            $input_buffer = "";
+            $time = time();
+            echo "Reading wiki file ...$wiki_file_path...\n";
+            $bytes = 0;
+            $bytes_since_last_output = 0;
+            while (!feof($fr)) {
+                $input_text = $read($fr, self::BLOCK_SIZE);
+                if ($input_text === false || $input_text === "") {
+                    break;
+                }
+                $len = strlen($input_text);
+                $bytes += $len;
+                $bytes_since_last_output += $len;
+                if ($bytes_since_last_output > $output_message_threshold) {
+                    echo "Have now read ".$bytes." many bytes." .
+                        " Peak memory so far: ".memory_get_peak_usage().
+                        ".\n     Number of word grams so far: ".count($ngrams).
+                        ". Elapsed time so far: ".(time() - $time)."s\n";
+                    $bytes_since_last_output = 0;
+                }
+                $input_buffer .= $input_text;
+                $lines = explode("\n", $input_buffer);
+                // last piece may be an incomplete line; keep for next block
+                $input_buffer = array_pop($lines);
+                foreach ($lines as $line) {
+                    $collect($line);
+                }
+            }
+            if ($input_buffer !== "") {
+                // final line of a file that does not end in a newline
+                $collect($input_buffer);
+            }
+            $close($fr);
+        }
+        if ($is_count_type) {
+            arsort($ngrams);
+            $ngrams = array_keys($ngrams);
+        }
+        // array_unique keeps the original keys; reindex so the list has no
+        // gaps
+        $ngrams = array_values(array_unique($ngrams));
+        if ($max_terms > 0 && count($ngrams) > $max_terms) {
+            $ngrams = array_slice($ngrams, 0, $max_terms);
+        }
+        $num_ngrams_found = count($ngrams);
+        // in is_all case add prefix*'s for (n >= 3)-grams
+        if ($is_all) {
+            $prefixes = [];
+            foreach ($ngrams as $full_gram) {
+                $ngram_parts = explode(" ", (string)$full_gram);
+                $ngram_in_word = count($ngram_parts);
+                $prefix = $ngram_parts[0];
+                for ($j = 1; $j < $ngram_in_word - 1; $j++) {
+                    $prefix .= " " . $ngram_parts[$j];
+                    $prefixes[] = $prefix . "*";
+                }
+            }
+            $ngrams = array_values(
+                array_unique(array_merge($ngrams, $prefixes)));
+            $num_ngrams_found = count($ngrams);
+        }
+        sort($ngrams);
+        $ngrams_string = implode("\n", $ngrams);
+        file_put_contents($ngrams_file_path, $ngrams_string);
+        return [$num_ngrams_found, $max_gram_len];
+    }
+}
--- a/src/library/Notifier.php
+++ b/src/library/Notifier.php
@ -0,0 +1,63 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * A Notifier is an object which will be notified by a priority queue
+ * when the index in the queue viewed as array of some data item has been
+ * changed.
+ *
+ * A Notifier is notified when the index in the queue viewed as array of some
+ * data item has been changed, this gives the Notifier object the ability to
+ * update its value of the index for that data item. As an example, in the
+ * search engine, the WebQueueBundle class implements Notifier. Web queue
+ * bundles store url together with their weights and allow one to get out the
+ * url of highest weight. This is implemented by storing in a PriorityQueue
+ * keys consisting of hashes of urls (as fixed length) and values consisting of
+ * the weight. Then in a web archive the url and its index in the priority
+ * queue is stored. When the index in the queue changes, the WebQueueBundle's
+ * notify method is called to adjust the index that is stored in the web
+ * archive.
+ *
+ * @author Chris Pollett
+ * @see WebQueueBundle
+ */
+interface Notifier
+{
+    /**
+     * Handles the update of the index of a data item in a queue with respect
+     * to the Notifier object.
+     *
+     * @param int $index  the index of a row in a heap-based priority queue
+     * @param mixed $data  the data that is stored at that index
+     */
+    public function notify($index, $data);
+}
--- a/src/library/PageRuleParser.php
+++ b/src/library/PageRuleParser.php
@ -0,0 +1,553 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Has methods to parse user-defined page rules and apply them to documents
+ * to be indexed.
+ *
+ * There are two types of statements that a user can define:
+ * command statements and assignment statements
+ *
+ * A command statement takes a key field argument for the page associative array
+ * and does a function call to manipulate that page.
+ * These have the syntax:
+ * addMetaWord(field)        ;add the field and field value to the META_WORDS
+ *                          ;array for the page
+ * addKeywordLink(field)     ;split the field on a comma, view this as a search
+ *                          ;keywords => link text association, and add this to
+ *                          ;the KEYWORD_LINKS array.
+ * setStack(field)           ;set which field value should be used as a stack
+ * pushStack(field)          ;add the field value for field to the top of stack
+ * popStack(field)           ;pop the top of the stack into the field value for
+ *                          ;field
+ * setOutputFolder(dir)      ;if auxiliary output, rather than just to the
+ *                          ; a yioop index, is being done, then set the folder
+ *                          ; for this output to be dir
+ * setOutputFormat(format)   ;format of auxiliary output either CSV or SQL
+ *                          ;SQL mean that writeOutput will write an insert
+ *                          ;statement
+ * setOutputTable(table)     ;if output is SQL then what table to use for the
+ *                          ;insert statements
+ * toArray(field)            ;splits field value for field on a comma and
+ *                          ;assign field value to be the resulting array
+ * toString(field)           ;if field value is an array then implode that
+ *                          ;array using comma and store the result in field
+ *                          ;value
+ * unset(field)              ;unset that field value
+ * writeOutput(field)        ;use the contents of field value viewed as an array
+ *                          ;to fill in the columns of a SQL insert statement
+ *                          ;or CSV row
+ *
+ * Assignments can either be straight assignments with '=' or concatenation
+ * assignments with '.='. There are the following kinds of values that one
+ * can assign:
+ *
+ * field = some_other_field ; sets $page['field'] = $page['some_other_field']
+ * field = "some_string" ; sets $page['field'] to "some_string"
+ * field = /some_regex/replacement_where_dollar_vars_allowed/
+ *    ; computes the results of replacing matches to some_regex in
+ *    ; $page['field'] with replacement_where_dollar_vars_allowed
+ * field = /some_regex/g ;sets $page['field'] to the array of all matches
+ *    ; of some regex in $page['field']
+ *
+ * For each of the above assignments we could have used ".=" instead of "="
+ *
+ * @author Chris Pollett
+ */
+class PageRuleParser implements CrawlConstants
+{
+    /**
+     * Used to store parse trees that this parser executes
+     * @var array
+     */
+    public $rule_trees;
+    /**
+     * If outputting to auxiliary file is being done, the current folder to
+     * use for such output
+     *
+     * @var string
+     */
+    public $output_folder="";
+    /**
+     * If outputting to auxiliary file is being done, the current file format
+     * to output with (either SQL or CSV)
+     *
+     * @var string
+     */
+    public $output_format="";
+
+    /**
+     * If outputting to auxiliary file is being done, and the current file
+     * format is SQL then what table to output insert statements for
+     *
+     * @var string
+     */
+    public $output_table="";
+    /**
+     * Name of field which will be used as a stack for push and popping other
+     * fields values
+     *
+     * @var string
+     */
+    public $stack;
+    /**
+     * Builds a PageRuleParser for the given page rules by parsing them
+     * once into the parse trees stored in $rule_trees
+     *
+     * @param string $page_rules a sequence of lines with page rules
+     *     as described in the class comments
+     */
+    public function __construct($page_rules = "")
+    {
+        $parsed_trees = $this->parseRules($page_rules);
+        $this->rule_trees = $parsed_trees;
+    }
+    /**
+     * Parses a string of pages rules into parse trees that can be executed
+     * later
+     *
+     * Each rule found yields one tree. A command rule gives a tree with
+     * keys "func_call" and "arg". An assignment rule gives a tree with keys
+     * "var", "assign_op" ("=" or ".="), "value_type" and "value" where:
+     * value_type "string" has value the text between the quotes (with
+     *     backslash escapes such as \" resolved);
+     * value_type "literal" has value the name of the other field;
+     * value_type "substitution" has value [regex, replacement];
+     * value_type "match_all" has value [regex, "g"].
+     * Text that does not match the rule grammar is ignored.
+     *
+     * @param string $page_rules a sequence of lines with page rules
+     *     as described in the class comments
+     * @return array of parse trees which can be executed in sequence
+     */
+    public function parseRules($page_rules)
+    {
+        // quoted string in which \ followed by any character is an escape
+        $quote_string = '"([^"\\\\]*(\\\\.[^"\\\\]*)*)"';
+        $blank = '[ \t]';
+        $comment = $blank.'*;[^\n]*';
+        $literal = '\w+';
+        $assignment = '\.?=';
+        $end = '(?:\n|\Z)';
+        $sub_or_match_all = '(/[^/\n]+/)(g|([^/\n]*)/)';
+        $command = '(\w+)'."$blank*".'\('."$blank*".'([\w\/]+)'.
+            "$blank*".'\)';
+        /*
+            Capture groups of $rule:
+            1 command name, 2 command argument, 3 comment after command,
+            4 assigned field, 5 assignment operator, 6 whole value,
+            7 quoted value with quotes, 8 quoted value without quotes,
+            9 (internal to quoted value), 10 literal value,
+            11 whole regex value, 12 /regex/, 13 "g" or "replacement/",
+            14 replacement, 15 comment after assignment
+         */
+        $rule =
+            "@(?:$command$blank*($comment)?$end".
+            "|$blank*($literal)$blank*($assignment)$blank*".
+            "((".$quote_string.")|($literal)|($sub_or_match_all))".
+            "$blank*($comment)?$end)@";
+        $matches = [];
+        preg_match_all($rule, $page_rules, $matches);
+        $rule_trees = [];
+        if (!isset($matches[0]) ||
+            ($num_rules = count($matches[0])) == 0) {
+            return $rule_trees;
+        }
+        for ($i = 0; $i < $num_rules; $i++) {
+            $tree = [];
+            if ($matches[1][$i] != "" || $matches[3][$i] != "") {
+                $tree["func_call"] = $matches[1][$i];
+                $tree["arg"] = isset($matches[2][$i]) ? $matches[2][$i] : "";
+            } else {
+                $tree["var"] = $matches[4][$i];
+                $tree["assign_op"] = $matches[5][$i];
+                $value_type_indicator = $matches[6][$i][0];
+                if ($value_type_indicator == '"') {
+                    $tree["value_type"] = "string";
+                    // drop the backslash of each escape sequence
+                    $tree["value"] = preg_replace('/\\\\(.)/', '$1',
+                        $matches[8][$i]);
+                } else if ($value_type_indicator == '/') {
+                    if ($matches[13][$i] === "g") {
+                        $tree["value_type"] = "match_all";
+                        $tree["value"] = [$matches[12][$i], "g"];
+                    } else {
+                        $tree["value_type"] = "substitution";
+                        // group 14, not 13: 13 still has the closing "/"
+                        $tree["value"] = [$matches[12][$i],
+                            $matches[14][$i]];
+                    }
+                } else {
+                    $tree["value_type"] = "literal";
+                    $tree["value"] = $matches[10][$i];
+                }
+            }
+            $rule_trees[] = $tree;
+        }
+        return $rule_trees;
+    }
+    /**
+     * Runs a sequence of rule parse trees, in order, against the provided
+     * $page_data associative array. If no (or an empty) sequence is passed,
+     * the trees stored in this object's $rule_trees are run instead.
+     *
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record (will be changed by this operation)
+     * @param array $rule_trees an array of annotated syntax trees for the
+     *     rules used to update $page_data
+     */
+    public function executeRuleTrees(&$page_data, $rule_trees = null)
+    {
+        $trees_to_run = ($rule_trees == null) ? $this->rule_trees :
+            $rule_trees;
+        foreach ($trees_to_run as $rule_tree) {
+            if (!isset($rule_tree['func_call'])) {
+                $this->executeAssignmentRule($rule_tree, $page_data);
+                continue;
+            }
+            $this->executeFunctionRule($rule_tree, $page_data);
+        }
+    }
+    /**
+     * Used to execute a single command rule on $page_data
+     *
+     * The command name in the tree is looked up in a fixed table mapping
+     * rule commands to methods of this class; the method found is called
+     * with the tree's argument and $page_data. Commands not in the table
+     * are silently ignored.
+     *
+     * @param array $tree annotated syntax tree of a function call rule
+     *     (keys "func_call" and "arg")
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record (will be changed by this operation)
+     */
+    public function executeFunctionRule($tree, &$page_data)
+    {
+        $method_for_command = [
+            "addMetaWord" => "addMetaWord",
+            "addKeywordLink" => "addKeywordLink",
+            "setOutputFolder" => "setOutputFolder",
+            "setOutputFormat" => "setOutputFormat",
+            "setOutputTable" => "setOutputTable",
+            "setStack" => "setStack",
+            "pushStack" => "pushStack",
+            "popStack" => "popStack",
+            "toArray" => "toArray",
+            "toString" => "toString",
+            "unset" => "unsetVariable",
+            "writeOutput" => "writeOutput"
+        ];
+        $command = $tree['func_call'];
+        if (!isset($method_for_command[$command])) {
+            return;
+        }
+        $method = $method_for_command[$command];
+        $this->$method($tree['arg'], $page_data);
+    }
+    /**
+     * Used to execute a single assignment rule on $page_data
+     *
+     * The value computed depends on the tree's value_type:
+     * "literal" -- the current value of the other field named by the tree
+     *     ("" if that field is not set);
+     * "string" -- the string stored in the tree;
+     * "substitution" -- the result of preg_replace'ing the tree's regex
+     *     with its replacement in the current value of the field;
+     * "match_all" -- the array of all complete matches of the tree's regex
+     *     in the current value of the field (empty array if none).
+     * The value is then assigned ("=") or concatenated (".=") to the field.
+     *
+     * @param array $tree annotated syntax tree of an assignment rule
+     *     (keys "var", "assign_op", "value_type", "value")
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record (will be changed by this operation)
+     */
+    public function executeAssignmentRule($tree, &$page_data)
+    {
+        $field = $this->getVarField($tree["var"]);
+        if (!isset($page_data[$field])) {
+            $page_data[$field] = "";
+        }
+        $value = "";
+        switch ($tree['value_type']) {
+            case "literal":
+                $literal = $this->getVarField($tree["value"]);
+                if (isset($page_data[$literal])) {
+                    $value = $page_data[$literal];
+                }
+                break;
+            case "string":
+                $value = $tree["value"];
+                break;
+            case "substitution":
+                $value = preg_replace($tree["value"][0], $tree["value"][1],
+                    $page_data[$field]);
+                break;
+            case "match_all":
+                // $tree["value"][1] is just the "g" marker; the subject of
+                // the match is the field's current value
+                $value = [];
+                if (is_scalar($page_data[$field])) {
+                    $found = [];
+                    preg_match_all($tree["value"][0],
+                        (string)$page_data[$field], $found);
+                    if (isset($found[0])) {
+                        $value = $found[0];
+                    }
+                }
+                break;
+        }
+        if ($tree["assign_op"] == "=") {
+            $page_data[$field] = $value;
+        } else {
+            $page_data[$field] .= $value;
+        }
+    }
+    /**
+     * Either returns $var_name or the value of the CrawlConstant with name
+     * $var_name.
+     *
+     * The constant is looked up by its namespace qualified name: defined()
+     * and constant() take the name as a string and so do not resolve
+     * "CrawlConstants" relative to this file's namespace.
+     *
+     * @param string $var_name field to look up
+     * @return string value of the CrawlConstants constant called $var_name
+     *     if there is one; otherwise, $var_name itself
+     */
+    public function getVarField($var_name)
+    {
+        $constant_name = __NAMESPACE__ . '\CrawlConstants::' . $var_name;
+        if (defined($constant_name)) {
+            return constant($constant_name);
+        }
+        return $var_name;
+    }
+    /**
+     * Appends the meta word u:$field_name:$page_data[$field_name] to the
+     * META_WORDS array of this page, creating that array if needed. Here
+     * $field_name is $field after lookup with getVarField. Does nothing if
+     * the page has no value for the field.
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function addMetaWord($field, &$page_data)
+    {
+        $key = $this->getVarField($field);
+        if (isset($page_data[$key])) {
+            $new_meta_word = "u:$key:{$page_data[$key]}";
+            if (!isset($page_data[CrawlConstants::META_WORDS])) {
+                $page_data[CrawlConstants::META_WORDS] = [];
+            }
+            $page_data[CrawlConstants::META_WORDS][] = $new_meta_word;
+        }
+    }
+    /**
+     * Adds a $keywords => $link_text pair to the KEYWORD_LINKS array for
+     * this page based on the value $field on the page. The pair is extracted
+     * by splitting on comma. The KEYWORD_LINKS array can be used when
+     * a cached version of a page is displayed to show a list of links
+     * from the cached page in the header. These links correspond to search
+     * in Yioop. for example the value:
+     * madonna, rock star
+     * would add a link to the top of the cache page with text "rock star"
+     * which when clicked would perform a Yioop search on madonna.
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function addKeywordLink($field, &$page_data)
+    {
+        $key = $this->getVarField($field);
+        if (!isset($page_data[$key])) {
+            return;
+        }
+        $pieces = explode(",", $page_data[$key]);
+        // need at least "keywords, link text" to build a link
+        if (count($pieces) < 2) {
+            return;
+        }
+        // only the first two comma separated parts are used
+        $key_words = $pieces[0];
+        $link_text = $pieces[1];
+        $links_key = CrawlConstants::KEYWORD_LINKS;
+        if (!isset($page_data[$links_key])) {
+            $page_data[$links_key] = [];
+        }
+        $page_data[$links_key][$key_words] = $link_text;
+    }
+    /**
+     * Set field variable to be used as a stack
+     *
+     * @param $field what field variable to use for current stack
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function setStack($field, &$page_data)
+    {
+        // remember which $page_data key the stack commands operate on
+        $this->stack = $this->getVarField($field);
+        $current = isset($page_data[$this->stack]) ?
+            $page_data[$this->stack] : null;
+        if (is_string($current)) {
+            // a lone string becomes a one element stack
+            $page_data[$this->stack] = [$current];
+        } else if (!is_array($current)) {
+            // unset or of an unusable type: start with an empty stack
+            $page_data[$this->stack] = [];
+        }
+        // an existing array is left untouched and used as the stack as is
+    }
+    /**
+     * Pushes an element or items in an array stored in field onto the current
+     * stack
+     *
+     * @param $field what field to get data to push onto the current stack
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function pushStack($field, &$page_data)
+    {
+        $var_field = $this->getVarField($field);
+        // nothing to do without a current stack or without data to push
+        if (!isset($page_data[$this->stack]) ||
+            !isset($page_data[$var_field])) {
+            return;
+        }
+        $items = $page_data[$var_field];
+        if (is_string($items)) {
+            // a single string is pushed as one element
+            $page_data[$this->stack][] = $items;
+        } else if (is_array($items)) {
+            /* An array has each of its items appended. The result must be
+               stored back into $page_data; $this->stack only holds the
+               name of the stack field and must not be overwritten.
+             */
+            $page_data[$this->stack] = array_merge($page_data[$this->stack],
+                $items);
+        }
+        // values of any other type are ignored
+    }
+    /**
+     * Pops the last element off the current stack and stores it in
+     * $page_data under the key given by field
+     *
+     * @param $field what field to store the popped element in
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function popStack($field, &$page_data)
+    {
+        // only pop when the current stack exists in $page_data
+        if (isset($page_data[$this->stack])) {
+            $target = $this->getVarField($field);
+            $page_data[$target] = array_pop($page_data[$this->stack]);
+        }
+    }
+    /**
+     * Set output folder
+     *
+     * @param $dir output directory in which to write data.txt files containing
+     *     the contents of some fields after writeOutput commands
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function setOutputFolder($dir, &$page_data)
+    {
+        // strip surrounding whitespace, then canonicalize the path;
+        // realpath() yields false if the folder cannot be resolved
+        $folder = trim($dir);
+        $this->output_folder = realpath($folder);
+    }
+    /**
+     * Set output format
+     *
+     * @param $format can be either csv or sql
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function setOutputFormat($format, &$page_data)
+    {
+        $known_formats = ["csv", "sql"];
+        // unrecognized formats are ignored, the previous format stays active
+        if (!in_array($format, $known_formats)) {
+            return;
+        }
+        $this->output_format = $format;
+    }
+    /**
+     * Set output table
+     *
+     * @param $table table to use if output format is sql
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function setOutputTable($table, &$page_data)
+    {
+        // stored as given; writeOutput places it into its INSERT statements
+        $this->output_table = $table;
+    }
+    /**
+     * If $page_data[$field] is a string, splits it into an array on comma,
+     * trims leading and trailing spaces from each item and stores the result
+     * back into $page_data[$field]
+     *
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function toArray($field, &$page_data)
+    {
+        $var_field = $this->getVarField($field);
+        // a missing field is left alone rather than read (which would
+        // raise an undefined index warning); non-strings are left as is
+        if (isset($page_data[$var_field]) &&
+            is_string($page_data[$var_field])) {
+            // split on comma and trim whitespace around each item
+            $page_data[$var_field] = array_map('trim',
+                explode(",", $page_data[$var_field]));
+        }
+    }
+    /**
+     * If $page_data[$field] is an array, implode it into a string on comma,
+     * and stores the result back into $page_data[$field]
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function toString($field, &$page_data)
+    {
+        $var_field = $this->getVarField($field);
+        // a missing field is left alone rather than read (which would
+        // raise an undefined index warning); non-arrays are left as is
+        if (isset($page_data[$var_field]) &&
+            is_array($page_data[$var_field])) {
+            $page_data[$var_field] = implode(",", $page_data[$var_field]);
+        }
+    }
+    /**
+     * Unsets the key $field (or the crawl constant it corresponds to)
+     * in $page_data. If it is a crawlconstant it doesn't unset it --
+     * it just sets it to the empty string
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function unsetVariable($field, &$page_data)
+    {
+        $resolved = $this->getVarField($field);
+        if ($resolved != $field) {
+            // $field named a crawl constant: blank the entry, keep the key
+            $page_data[$resolved] = "";
+            return;
+        }
+        // an ordinary variable is removed outright
+        unset($page_data[$resolved]);
+    }
+    /**
+     * Write the value of a field to the output folder in the current
+     * format. If the field is not set nothing is written
+     *
+     * @param $field the key in $page_data to use
+     * @param array& $page_data an associative array of containing summary
+     *     info of a web page/record
+     */
+    public function writeOutput($field, &$page_data)
+    {
+        $var_field = $this->getVarField($field);
+        // need both a value to write and a folder to write it into
+        if (!isset($page_data[$var_field]) || !$this->output_folder) {
+            return;
+        }
+        $data_file = "{$this->output_folder}/data.txt";
+        /* Once data.txt grows past C\MAX_LOG_FILE_SIZE, store a compressed
+           copy as data.txt.N.gz, where N is the number of such copies that
+           already exist, and start a fresh data.txt
+         */
+        if (file_exists($data_file) &&
+            filesize($data_file) > C\MAX_LOG_FILE_SIZE) {
+            clearstatcache(); //hopefully, this doesn't slow things too much
+            $data_files = glob("$data_file.*.gz");
+            $num_data_files = count($data_files);
+            file_put_contents("$data_file.$num_data_files.gz",
+                gzcompress(file_get_contents($data_file)));
+            unlink($data_file);
+        }
+        $out = $page_data[$var_field];
+        if (!$out) {
+            return;
+        }
+        // a scalar value is written as a one column row
+        if (!is_array($out)) {
+            $out = [$out];
+        }
+        $fh = fopen($data_file, "a");
+        if (!$fh) {
+            return;
+        }
+        switch ($this->output_format) {
+            case 'csv':
+                fputcsv($fh, $out);
+                break;
+            case 'sql':
+                // without a table name no statement can be formed
+                if (!$this->output_table) {
+                    break;
+                }
+                $sql = "INSERT INTO {$this->output_table} ";
+                if (isset($out[0])) {
+                    // numerically indexed data: positional insert
+                    $sql .= " VALUES(";
+                } else {
+                    /* Associative data: the keys are the column names.
+                       Joining with implode avoids the separator variable
+                       that used to be read here before being initialized.
+                     */
+                    $sql .= '( ' . implode(", ", array_keys($out)) .
+                        ') VALUES(';
+                }
+                $quoted = [];
+                foreach ($out as $value) {
+                    $quoted[] = "'" . addslashes($value) . "'";
+                }
+                $sql .= " " . implode(", ", $quoted) . ");\n";
+                fwrite($fh, $sql);
+                break;
+        }
+        fclose($fh);
+    }
+}
--- a/src/library/PartialZipArchive.php
+++ b/src/library/PartialZipArchive.php
@ -0,0 +1,155 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Used to extract files from an initial segment or a fragment of a
+ * ZIP Archive.
+ *
+ * @author Chris Pollett
+ */
+class PartialZipArchive
+{
+    /**
+     * Stores path/filename -> (compression type, compressed file) associations
+     * for all files in the archive that were extractable from the given
+     * zip archive fragment
+     * @var array
+     */
+    public $zip_directory = [];
+    /**
+     * Stores path/filenames that were discovered in the initial segment of
+     * this zip archive
+     * @var array
+     */
+    public $zip_file_names = [];
+    /** ZIP code to indicate compression type is no compression used*/
+    const NO_COMPRESSION = 0;
+    /** ZIP code to indicate compression type is deflate*/
+    const DEFLATE = 8;
+    /** ZIP code to indicate compression type is enhanced deflate (4gb barrier
+     *  passable)
+     */
+    const ENHANCED_DEFLATE = 9;
+    /** Byte string to indicate start of a local file header, used to find
+     *  locations of all the files stored in ZIP fragment we have
+     */
+    const LOCAL_FILE_HEADER = "\x50\x4B\x03\x04";
+    /** Number of bytes of fixed-size fields that follow the
+     *  LOCAL_FILE_HEADER signature and precede the file name
+     */
+    const HEADER_LENGTH = 26;
+    /**
+     * Sets up a PartialZipArchive so that files can be extracted from it.
+     * To do this it populates the two field variables @see $zip_directory
+     * and @see $zip_file_names. Offsets used in the code for extracting
+     * various fields out of a zip archive local file header were gotten
+     * from https://en.wikipedia.org/wiki/ZIP_%28file_format%29
+     * Note the code for the constructor just splits the whole string into
+     * parts on the string @see LOCAL_FILE_HEADER. It doesn't bother to try
+     * to use the zip archive's directory (which might not be in the portion
+     * of this zip archive given). It is possible for a file contained
+     * in the archive to actually have within it the string
+     * LOCAL_FILE_HEADER, in which case that file would be screwed up by
+     * our approach.
+     *
+     * Entries whose header or data is cut off by the end of the fragment,
+     * as well as entries with an empty name or empty data, are not recorded.
+     *
+     * @param string $zip_string a substring of a zip archive file
+     */
+    public function __construct($zip_string)
+    {
+        $sub_files = explode(self::LOCAL_FILE_HEADER, $zip_string);
+        /* The first piece is whatever came before the first signature (or
+           the whole string if there is no signature). It is never the
+           start of an entry, so it must not be parsed as a header.
+         */
+        array_shift($sub_files);
+        foreach ($sub_files as $sub_file) {
+            // fixed-size part of the header was truncated
+            if (strlen($sub_file) < self::HEADER_LENGTH) {
+                continue;
+            }
+            /* Offsets relative to the end of the signature:
+               4 compression method (low byte), 14 compressed size
+               (32 bit little endian), 22 file name length and 24 extra
+               field length (both 16 bit little endian)
+             */
+            $header = unpack("Ccompression/x9/Vsize/x4/vname_len/vextra_len",
+                substr($sub_file, 4, 22));
+            $file_size = $header["size"];
+            $file_start = self::HEADER_LENGTH + $header["name_len"] +
+                $header["extra_len"];
+            // casts cover substr returning false on older PHP versions
+            $file_name = (string)substr($sub_file, self::HEADER_LENGTH,
+                $header["name_len"]);
+            $file_string = (string)substr($sub_file, $file_start, $file_size);
+            // compressed data runs past the end of the fragment
+            if (strlen($file_string) < $file_size) {
+                continue;
+            }
+            // compare against "" so a name or contents of "0" is kept
+            if ($file_name !== "" && $file_string !== "") {
+                $this->zip_directory[$file_name] =
+                    [$header["compression"], $file_string];
+                $this->zip_file_names[] = $file_name;
+            }
+        }
+    }
+    /**
+     * Returns the total number of files that were detected in the zip archive
+     * fragment.
+     *
+     * @return int number of files found in archive
+     */
+    public function numFiles()
+    {
+        return count($this->zip_file_names);
+    }
+    /**
+     * Returns the file name for the ith file that was extractable from
+     * the archive string used in the constructor.
+     *
+     * @param int $index the number of the file wanted
+     * @return mixed its corresponding file name, or false if there is no
+     *      file with that index
+     */
+    public function getNameIndex($index)
+    {
+        if (isset($this->zip_file_names[$index])) {
+            return $this->zip_file_names[$index];
+        }
+        return false;
+    }
+    /**
+     * Returns from the PartialZipArchive the uncompressed contents of
+     * the provided path/filename if found, and false otherwise.
+     *
+     * @param string $file_name contains complete path and file_name of a file
+     * @return mixed uncompressed file contents if found and extractable,
+     *      false otherwise
+     */
+    public function getFromName($file_name)
+    {
+        if (!isset($this->zip_directory[$file_name])) {
+            return false;
+        }
+        list($compression, $file_string) = $this->zip_directory[$file_name];
+        switch ($compression) {
+            case self::NO_COMPRESSION:
+                return $file_string;
+            case self::DEFLATE:
+            case self::ENHANCED_DEFLATE:
+                return gzinflate($file_string);
+        }
+        // any other compression method is not handled
+        return false;
+    }
+}
--- a/src/library/PersistentStructure.php
+++ b/src/library/PersistentStructure.php
@ -0,0 +1,121 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * A PersistentStructure is a data structure which every so many operations
+ * will be saved to secondary storage (such as disk).
+ * An operation occurs whenever the PersistentStructure's checkSave method is
+ * called. A PersistentStructure also supports being loaded
+ * (read in) from secondary storage.
+ *
+ * @author Chris Pollett
+ */
+class PersistentStructure
+{
+    /**
+     * Number of operations between saves used when the constructor is not
+     * told otherwise
+     * @var int
+     */
+    const DEFAULT_SAVE_FREQUENCY = 50000;
+    /**
+     * Name of the file the PersistentStructure is written to by save()
+     * @var string
+     */
+    public $filename;
+    /**
+     * How many times checkSave() has been called since the last save it
+     * performed
+     * @var int
+     */
+    public $unsaved_operations;
+    /**
+     * Number of operations between saves. If this is not positive,
+     * checkSave() never saves
+     * @var int
+     */
+    public $save_frequency;
+
+    /**
+     * Records the file name and save frequency for the PersistentStructure
+     * and resets the count of unsaved operations to zero
+     *
+     * @param string $fname the name of the file to store the
+     *     PersistentStructure in
+     * @param int $save_frequency the number of operations before a save.
+     *     If <= 0, checkSave never saves
+     */
+    public function __construct($fname,
+        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
+    {
+        $this->unsaved_operations = 0;
+        $this->save_frequency = $save_frequency;
+        $this->filename = $fname;
+    }
+    /**
+     * Reads a serialized PersistentStructure from a file and returns it as
+     * an instance of the class this method was called on
+     *
+     * @param string $fname the name of the file to load the
+     *      PersistentStructure from
+     * @return object the PersistentStructure loaded
+     */
+    public static function load($fname)
+    {
+        /* The serialized data starts with O:<name length>:"<class name>".
+           The stored class name might carry a different (older) name space,
+           so that prefix is replaced with one naming the called class
+           before unserializing.
+         */
+        $serialized = file_get_contents($fname);
+        $stored_name_length = intval(substr($serialized, 2, 14));
+        // the trailing 2 accounts for the quotes around the class name
+        $stored_prefix_length = strlen("O:" . $stored_name_length . ":") +
+            $stored_name_length + 2;
+        $class_name = static::class;
+        $new_prefix = sprintf('O:%d:"%s"', strlen($class_name), $class_name);
+        return unserialize($new_prefix .
+            substr($serialized, $stored_prefix_length));
+    }
+    /**
+     * Writes the serialized PersistentStructure to its filename.
+     * This method is generic but very memory inefficient, so subclasses
+     * should reimplement it as needed
+     */
+    public function save()
+    {
+        $serialized = serialize($this);
+        file_put_contents($this->filename, $serialized);
+    }
+    /**
+     * Adds one to the unsaved_operations count. Once the count reaches
+     * save_frequency the PersistentStructure is saved to secondary storage
+     * and the count starts again from zero
+     */
+    public function checkSave()
+    {
+        $this->unsaved_operations++;
+        if ($this->save_frequency <= 0 ||
+            $this->unsaved_operations < $this->save_frequency) {
+            return;
+        }
+        // save first: the stored copy still carries the pre-reset count
+        $this->save();
+        $this->unsaved_operations = 0;
+    }
+}
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
--- a/src/library/PriorityQueue.php
+++ b/src/library/PriorityQueue.php
@ -0,0 +1,382 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Loaded for crawlLog function
+ */
+require_once __DIR__."/Utility.php";
+/**
+ *
+ * Code used to manage a memory efficient priority queue.
+ * Weights for the queue must be floats. The queue itself is
+ * implemented using heaps
+ *
+ * @author Chris Pollett
+ */
+class PriorityQueue extends StringArray implements CrawlConstants
+{
+    /**
+     * Number of values that can be stored in the priority queue
+     * @var int
+     */
+    public $num_values;
+    /**
+     * Number of bytes needed to store a value associated with a weight
+     * @var int
+     */
+    public $value_size;
+    /**
+     * Number of bytes needed to store a weight in the queue
+     * @var int
+     */
+    public $weight_size = 4; //size of a float
+    /**
+     * Number of items that are currently stored in the queue
+     * @var int
+     */
+    public $count;
+    /**
+     * Whether polling the queue returns the least or most weighted value
+     * (compared against self::MIN and self::MAX)
+     * @var string
+     */
+    public $min_or_max;
+    /**
+     * An object that implements the Notifier interface (for instance,
+     * WebQueueArchive). Its notify method is called whenever a row is
+     * written to a location in the queue
+     * @var object
+     */
+    public $notifier;
+    /**
+     * Makes a priority queue (implemented as an array heap) with the given
+     * operating parameters
+     *
+     * @param string $fname filename to store the data associated with the queue
+     * @param int $num_values number of values the queue can hold
+     * @param int $value_size the size in a bytes of a value
+     * @param string $min_or_max whether this priority queue return least or
+     *     most weight values when polled
+     * @param object $notifier object to call when a value changes in the queue
+     * @param int $save_frequency how often the data in the queue should be
+     *     save to disk. (It's default location is RAM)
+     */
+    public function __construct($fname, $num_values, $value_size,
+        $min_or_max, $notifier = null,
+        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
+    {
+        $this->num_values = $num_values;
+        $this->value_size = $value_size;
+        $this->min_or_max = $min_or_max;
+        $this->count = 0;
+        $this->notifier = $notifier;
+        // each stored row is a value followed by its packed weight
+        parent::__construct($fname, $num_values,
+            $value_size + $this->weight_size, $save_frequency);
+    }
+    /**
+     * Gets the data stored at the ith location in the priority queue
+     *
+     * @param int $i location to return data from
+     * @return mixed array data if the value of $i is between 1 and count, false
+     *     otherwise
+     */
+    public function peek($i = 1)
+    {
+        if ($i < 1 || $i > $this->count) {
+            crawlLog("Peek Index $i not in Range [1, {$this->count}]");
+            return false;
+        }
+        return $this->getRow($i);
+    }
+    /**
+     * Removes and returns the ith element out of the Priority queue.
+     * Since this is a priority queue the first element in the queue
+     * will either be the min or max (depending on queue type) element
+     * stored. If $i is not in range an error message is written to the log.
+     * This operation also performs a check to see if the queue should be
+     * saved to disk
+     *
+     * @param int $i element to get out of the queue
+     * @return mixed array data if the value of $i is between 1 and count, false
+     *     otherwise
+     */
+    public function poll($i = 1)
+    {
+        if ($i < 1 || $i > $this->count) {
+            crawlLog("Index $i not in Range [1, {$this->count}]");
+            return false;
+        }
+        $extreme = $this->peek($i);
+        // fill the hole with the last row, then shrink the queue
+        $last_entry = $this->getRow($this->count);
+        $this->putRow($i, $last_entry);
+        $this->count--;
+        /* When removing from the interior of the heap, the row moved into
+           slot $i comes from a different subtree and so may have higher
+           priority than its new parent. In that case it has to move up
+           rather than down to restore the heap property.
+         */
+        $needs_up = false;
+        if ($i > 1 && $i <= $this->count) {
+            $parent_row = $this->getRow(floor($i/2));
+            $needs_up = ($this->compare($parent_row[1], $last_entry[1]) < 0);
+        }
+        if ($needs_up) {
+            $this->percolateUp($i);
+        } else {
+            $this->percolateDown($i);
+        }
+        $this->checkSave();
+        return $extreme;
+    }
+    /**
+     * Inserts a new item into the priority queue.
+     *
+     * @param string $data what to insert into the queue
+     * @param float $weight how much the new data should be weighted
+     * @return mixed index location in queue where item was stored if
+     *     successful, otherwise false (queue already holds num_values items).
+     */
+    public function insert($data, $weight)
+    {
+        if ($this->count == $this->num_values) {
+            return false;
+        }
+        $this->count++;
+        $cur = $this->count;
+        // add as the last leaf, then let it rise to its proper place
+        $this->putRow($cur, [$data, $weight]);
+        $loc = $this->percolateUp($cur);
+        return $loc;
+    }
+    /**
+     * Add $delta to the $ith element in the priority queue and then adjusts
+     * the queue to restore the heap property
+     *
+     * @param int $i element whose weight should be adjusted
+     * @param float $delta how much to change the weight by
+     * @return mixed false if $i is not a location in the queue, otherwise
+     *     nothing is returned
+     */
+    public function adjustWeight($i, $delta)
+    {
+        if ( ($tmp = $this->peek($i)) === false) {
+            crawlLog("Index $i not in queue adjust weight failed");
+            return false;
+        }
+        list($data, $old_weight) = $tmp;
+        $new_weight = $old_weight + $delta;
+        $this->putRow($i, [$data, $new_weight]);
+        /* A larger weight lowers priority in a min queue and raises it in
+           a max queue; a weight that did not grow is handled the opposite
+           way round.
+         */
+        if ($new_weight > $old_weight) {
+            if ($this->min_or_max == self::MIN) {
+                $this->percolateDown($i);
+            } else {
+                $this->percolateUp($i);
+            }
+        } else {
+            if ($this->min_or_max == self::MAX) {
+                $this->percolateDown($i);
+            } else {
+                $this->percolateUp($i);
+            }
+        }
+    }
+    /**
+     * Pretty prints the contents of the queue viewed as an array.
+     *
+     */
+    public function printContents()
+    {
+        for ($i = 1; $i <= $this->count; $i++) {
+            $row = $this->peek($i);
+            print "Entry: $i Value: ".$row[0]." Weight: ".$row[1]."\n";
+        }
+    }
+    /**
+     * Return the contents of the priority queue as an array of
+     * value weight pairs.
+     *
+     * @return array contents of the queue
+     */
+    public function getContents()
+    {
+        $rows = [];
+        for ($i = 1; $i <= $this->count; $i++) {
+            $rows[] = $this->peek($i);
+        }
+        return $rows;
+    }
+    /**
+     * Scales the weights of elements in the queue so that the sum of the new
+     * weights is $new_total. If the current total weight is not positive,
+     * every element is instead given the same weight $new_total/count.
+     *
+     * This function is used periodically to prevent the queue from being
+     * gummed up because all of the weights stored in it are too small.
+     *
+     * @param int $new_total what the new sum of weights of elements in the
+     *     queue will be after normalization
+     */
+    public function normalize($new_total = C\NUM_URLS_QUEUE_RAM)
+    {
+        $count = $this->count;
+        $total_weight = $this->totalWeight();
+
+        if ($total_weight <= 0) {
+            crawlLog(
+                "Total queue weight was zero!! Doing uniform renormalization!");
+        }
+        for ($i = 1; $i <= $count; $i++) {
+            $row = $this->getRow($i);
+            if ($total_weight > 0) {
+                $row[1] = ($new_total*$row[1])/$total_weight;
+            } else {
+                $row[1] = $new_total/$count;
+            }
+            $this->putRow($i, $row);
+        }
+    }
+    /**
+     * If the $ith element in the PriorityQueue violates the heap
+     * property with its parent node (children should be of lower
+     * priority than the parent), this function
+     * tries to modify the heap to restore the heap property.
+     *
+     * @param int $i node to consider in restoring the heap property
+     * @return int final position $ith node ends up at
+     */
+    public function percolateUp($i)
+    {
+        if ($i <= 1) {
+            return $i;
+        }
+        $start_row = $this->getRow($i);
+        $parent = $i;
+        while ($parent > 1) {
+            $child = $parent;
+            $parent = floor($parent/2);
+            $row = $this->getRow($parent);
+            if ($this->compare($row[1], $start_row[1]) < 0) {
+                // parent has lower priority, pull it down into the child slot
+                $this->putRow($child, $row);
+            } else {
+                $this->putRow($child, $start_row);
+                return $child;
+            }
+        }
+        // reached the root
+        $this->putRow(1, $start_row);
+        return 1;
+    }
+    /**
+     * If the ith element in the PriorityQueue violates the heap
+     * property with some child node (children should be of lower
+     * priority than the parent), this function
+     * tries to modify the heap to restore the heap property.
+     *
+     * @param int $i node to consider in restoring the heap property
+     */
+    public function percolateDown($i)
+    {
+        $start_row = $this->getRow($i);
+        $count = $this->count;
+        $parent = $i;
+        $child = 2*$parent;
+        while ($child <= $count) {
+            $left_child_row = $this->getRow($child);
+            if ($child < $count) { // this 'if' checks if there is a right child
+                $right_child_row = $this->getRow($child + 1);
+                // continue with whichever child has the higher priority
+                if ($this->compare(
+                    $left_child_row[1], $right_child_row[1]) < 0) {
+                    $child++;
+                }
+            }
+            $child_row = $this->getRow($child);
+            if ($this->compare($start_row[1], $child_row[1]) < 0) {
+                // child has higher priority, pull it up into the parent slot
+                $this->putRow($parent, $child_row);
+            } else {
+                $this->putRow($parent, $start_row);
+                return;
+            }
+            $parent = $child;
+            $child = 2 * $parent;
+        }
+        $this->putRow($parent, $start_row);
+    }
+    /**
+     * Computes the difference of the two values $value1 and $value2
+     *
+     * Which is subtracted from which is determined by whether this is
+     * a min_or_max priority queue. The callers in this class treat a
+     * negative result as $value1 having lower priority than $value2.
+     *
+     * @param float $value1 a value to take the difference between
+     * @param float $value2 the other value
+     * @return float the differences
+     */
+    public function compare($value1, $value2)
+    {
+        if ($this->min_or_max == self::MIN) {
+            return $value2 - $value1;
+        } else {
+            return $value1 - $value2;
+        }
+    }
+    /**
+     * Gets the ith element of the PriorityQueue viewed as an array
+     *
+     * @param int $i element to get
+     * @return array value stored in queue together with its weight as a two
+     *     element array
+     */
+    public function getRow($i)
+    {
+        $value_size = $this->value_size;
+        $weight_size = $this->weight_size;
+        $row = $this->get($i);
+        // the row is the value bytes followed by the packed float weight
+        $value = substr($row, 0, $value_size);
+        $pre_weight = substr($row, $value_size, $weight_size);
+        $weight_array = unpack("f", $pre_weight);
+        $weight = $weight_array[1];
+        return [$value, $weight];
+    }
+    /**
+     * Add data to the $i row of the priority queue viewed as an array
+     * Calls the notifier associated with this queue about the change
+     * in data's location
+     *
+     * @param int $i location to add data
+     * @param array $row data to add (a two element array in the form
+     *     key, float value).
+     */
+    public function putRow($i, $row)
+    {
+        $raw_data = $row[0].pack("f", $row[1]);
+        $this->put($i, $raw_data);
+        if ($this->notifier != null) {
+            $this->notifier->notify($i, $row);
+        }
+    }
+    /**
+     * Computes and returns the weight of all items in priority queue
+     *
+     * @return float weight of all items stored in the priority queue
+     */
+    public function totalWeight()
+    {
+        $count = $this->count;
+        $total_weight = 0;
+        for ($i = 1; $i <= $count; $i++) {
+            $row = $this->getRow($i);
+            $total_weight += $row[1];
+        }
+        return $total_weight;
+    }
+}
--- a/src/library/ScraperManager.php
+++ b/src/library/ScraperManager.php
@ -0,0 +1,185 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Charles Bocage (charles.bocage@sjsu.edu)
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+/**
+ * Class used by html processors to detect if a page matches a particular
+ * signature such as that of a content management system, and
+ * also to provide scraping mechanisms for the content of such a page
+ *
+ * @author Charles Bocage (charles.bocage@sjsu.edu)
+ */
+class ScraperManager
+{
+    /**
+     * Method used to check a page against a supplied list of scrapers
+     * for a matching signature. The first scraper whose signature matches
+     * is returned; its SIGNATURE and SCRAPE_RULES are returned with html
+     * entities decoded.
+     *
+     * @param string $page the html page to check
+     * @param array $scrapers an array of scrapers to check against. Each
+     *      scraper is an associative array with fields ID, NAME, SIGNATURE,
+     *      and SCRAPE_RULES. Empty entries and entries without a SIGNATURE
+     *      are skipped
+     * @return array an associative array of scraper properties if a matching
+     *      scraper signature found; otherwise, the empty array
+     */
+    public static function getScraper($page, $scrapers)
+    {
+        $out_scraper = [];
+        if (!is_array($scrapers)) {
+            return $out_scraper;
+        }
+        foreach ($scrapers as $scraper) {
+            // a scraper without a signature can never match a page
+            if (empty($scraper) || !isset($scraper['SIGNATURE'])) {
+                continue;
+            }
+            $signature = html_entity_decode(
+                $scraper['SIGNATURE'], ENT_QUOTES);
+            if (self::checkSignature($page, $signature)) {
+                $out_scraper['SIGNATURE'] = $signature;
+                $out_scraper['ID'] = $scraper['ID'];
+                $out_scraper['SCRAPE_RULES'] = html_entity_decode(
+                    $scraper['SCRAPE_RULES'], ENT_QUOTES);
+                $out_scraper['NAME'] = $scraper['NAME'];
+                break;
+            }
+        }
+        return $out_scraper;
+    }
+    /**
+     * Applies scrape rules to a given page. A scrape rule consists of
+     * a sequence of xpaths delimited by ###. The first path is used to
+     * extract content from the page, the remaining xpaths are used
+     * to delete content from the result.
+     *
+     * @param string $page the html page to operate on
+     * @param string $scrape_rules_string a string of xpaths with ###
+     *  used as a delimiter
+     * @return string the result of extracting first xpath content and
+     *  deleting from it according to the remaining xpath rules. If there
+     *  are no rules, the rules cannot be split, or the first xpath extracts
+     *  nothing, $page is returned unchanged
+     */
+    public static function applyScraperRules($page, $scrape_rules_string)
+    {
+        $scrape_rules = preg_split('/###/u',
+            $scrape_rules_string, 0, PREG_SPLIT_NO_EMPTY);
+        // preg_split gives false on failure (for example, a rule string
+        // which is not valid UTF-8), and [] when no rules were supplied
+        if (empty($scrape_rules)) {
+            return $page;
+        }
+        $extract_rule = array_shift($scrape_rules);
+        $temp_page = self::getContentByXquery($page, $extract_rule);
+        if (empty($temp_page)) {
+            return $page;
+        }
+        foreach ($scrape_rules as $tag_to_remove) {
+            $new_temp_page =
+                self::removeContentByXquery($temp_page, $tag_to_remove);
+            // only keep a removal that leaves something behind
+            if (!empty($new_temp_page)) {
+                $temp_page = $new_temp_page;
+            }
+        }
+        return $temp_page;
+    }
+    /**
+     * If $signature begins with '/', checks to see if applying
+     * the xpath in $signature to $page results
+     * in a non-empty dom node list. Otherwise, does a match of the
+     * regex (written without start and end delimiters (say, /))
+     * against $page using mb_ereg and returns whether it was found.
+     * An empty signature never matches.
+     *
+     * @param string $page a web document to check
+     * @param string $signature an xpath (if it starts with '/') or a
+     *      regex to check against
+     * @return boolean true if the xpath returns a non empty dom node list
+     *      or the regex matches; false otherwise
+     */
+    public static function checkSignature($page, $signature)
+    {
+        // guard against reading offset 0 of an empty string and against
+        // handing mb_ereg an empty pattern
+        if (!is_string($signature) || $signature === '') {
+            return false;
+        }
+        if ($signature[0] == '/') {
+            $results = false;
+            restore_error_handler();
+            $dom = self::loadDom($page);
+            if ($dom !== false) {
+                $xpath = new \DOMXPath($dom);
+                $results = $xpath->query($signature);
+            }
+            set_error_handler(C\NS_LIB . "yioop_error_handler");
+            // query() gives false for an expression it cannot evaluate
+            return !empty($results) && $results->length > 0;
+        }
+        return (mb_ereg($signature, $page) !== false);
+    }
+    /**
+     * Get the contents of a document via an xpath
+     * @param string $page a document to apply the xpath query against
+     * @param string $query the xpath query to run
+     *
+     * @return string the html of the first node matching the query,
+     *      otherwise an empty string
+     */
+    public static function getContentByXquery($page, $query)
+    {
+        $result = "";
+        restore_error_handler();
+        $dom = self::loadDom($page);
+        if ($dom !== false) {
+            $xpath = new \DOMXPath($dom);
+            $xpath_result = $xpath->query($query);
+            if (!empty($xpath_result) && $xpath_result->length > 0) {
+                $result = $dom->saveHTML($xpath_result->item(0));
+            }
+        }
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
+        return $result;
+    }
+    /**
+     * Removes from the contents of a document the results of
+     * an xpath query
+     * @param string $page a document to apply the xpath query against
+     * @param string $query the xpath query to run
+     *
+     * @return string the content less the xpath results as an HTML document.
+     *      If the page cannot be parsed or the query matches nothing, $page
+     *      is returned unchanged
+     */
+    public static function removeContentByXquery($page, $query)
+    {
+        $result = $page;
+        restore_error_handler();
+        $dom = self::loadDom($page);
+        if ($dom !== false) {
+            $xpath = new \DOMXPath($dom);
+            $xpath_result = $xpath->query($query);
+            // same false check as getContentByXquery: a bad xpath makes
+            // query() return false rather than a node list
+            if (!empty($xpath_result) && $xpath_result->length > 0) {
+                $len = $xpath_result->length;
+                for ($i = 0; $i < $len; $i++) {
+                    $node = $xpath_result->item($i);
+                    $parent = ($node) ? $node->parentNode : null;
+                    if ($parent) {
+                        $parent->removeChild($node);
+                    }
+                }
+                $result = $dom->saveHTML();
+            }
+        }
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
+        return $result;
+    }
+    /**
+     * Parses an html string into a DOMDocument with parser warnings
+     * suppressed. Empty and non-string input is rejected before the parser
+     * is called, since newer PHP versions raise an error that @ cannot
+     * suppress when asked to load an empty string.
+     *
+     * @param string $page html to parse
+     * @return mixed the loaded \DOMDocument, or false if $page was empty,
+     *      not a string, or could not be loaded
+     */
+    private static function loadDom($page)
+    {
+        if (!is_string($page) || $page === '') {
+            return false;
+        }
+        $dom = new \DOMDocument();
+        return (@$dom->loadHTML($page)) ? $dom : false;
+    }
+}
--- a/src/library/StringArray.php
+++ b/src/library/StringArray.php
@ -0,0 +1,143 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Load charCopy
+ */
+require_once __DIR__."/Utility.php";
+/**
+ * Memory efficient implementation of persistent arrays
+ *
+ * The standard array objects in php and even spl have a large amount of
+ * overhead. The point of this class is to have the size as close to the
+ * optimal as possible. All items are kept in one packed string, with item
+ * $i occupying the $data_size bytes starting at offset $i * $data_size.
+ *
+ * @author Chris Pollett
+ */
+class StringArray extends PersistentStructure
+{
+    /**
+     * Number of items to be stored in the StringArray
+     * @var int
+     */
+    public $num_values;
+    /**
+     * Size of each item in bytes to be stored
+     * @var int
+     */
+    public $data_size;
+    /**
+     * Number of bytes of storage needed by the string array
+     * ($num_values * $data_size)
+     * @var int
+     */
+    public $string_array_size;
+    /**
+     * Character string used to store the packed data of the StringArray
+     * @var string
+     */
+    public $string_array;
+    /**
+     * Initializes the fields of the StringArray and its parent class
+     * PersistentStructure. Creates a null filled string array of size
+     * $this->string_array_size to store data in.
+     *
+     * @param string $fname the name of the file to store data persistently in
+     * @param int $num_values the number of items the StringArray will store
+     * @param int $data_size the size in bytes of a single item
+     * @param int $save_frequency how often the StringArray should be stored to
+     *     disk
+     */
+    public function __construct($fname, $num_values, $data_size,
+        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
+    {
+        $this->num_values = $num_values;
+        $this->data_size = $data_size;
+        $this->string_array_size = $num_values * ($data_size);
+        $this->string_array = pack("x". $this->string_array_size);
+        parent::__construct($fname, $save_frequency);
+    }
+    /**
+     * Load a StringArray from a file written by save(). The file layout is:
+     * the packed size of the string array, the string array itself, and
+     * then the serialized object (without its string array).
+     *
+     * NOTE: the tail of the file is passed to unserialize(), so $fname must
+     * only ever name a file this application wrote itself.
+     *
+     * @param string $fname the name of the file to load the StringArray from
+     * @return mixed the PersistentStructure loaded, or false if the file
+     *     could not be opened or does not have the layout described above
+     */
+    public static function load($fname)
+    {
+        $fh = fopen($fname, "rb");
+        if ($fh === false) {
+            return false;
+        }
+        $size_bytes = fread($fh, 4);
+        if ($size_bytes === false || strlen($size_bytes) < 4) {
+            fclose($fh);
+            return false;
+        }
+        $array_size = unpackInt($size_bytes);
+        // fread refuses a length of 0, so an empty array is special-cased
+        $array = ($array_size > 0) ? fread($fh, $array_size) : "";
+        // everything after the string array is the serialized object;
+        // reading to end of stream avoids relying on cached filesize()
+        $serialized = stream_get_contents($fh);
+        fclose($fh);
+        if ($array === false || $serialized === false) {
+            return false;
+        }
+        $object = unserialize($serialized);
+        if (!is_object($object)) {
+            return false;
+        }
+        $object->string_array = & $array;
+        return $object;
+    }
+    /**
+     * Save the StringArray to its filename. The string array is written
+     * out raw and temporarily detached from the object so that it is not
+     * also included in the serialized copy of the object. Does nothing if
+     * the file cannot be opened for writing.
+     */
+    public function save()
+    {
+        $fh = fopen($this->filename, "wb");
+        if ($fh === false) {
+            return;
+        }
+        // hold a reference so the data survives the unset below
+        $tmp = & $this->string_array;
+        fwrite($fh, packInt($this->string_array_size));
+        fwrite($fh, $this->string_array);
+        unset($this->string_array);
+        fwrite($fh, serialize($this));
+        $this->string_array = & $tmp;
+        fclose($fh);
+    }
+    /**
+     * Looks up the ith item in the StringArray
+     *
+     * @param int $i array index of item to look up
+     * @return string the looked-up item of length $this->data_size
+     */
+    public function get($i)
+    {
+        $data_size = $this->data_size;
+        return substr($this->string_array, $i * $data_size, $data_size);
+    }
+    /**
+     * Puts data into the ith item of the StringArray
+     *
+     * @param int $i array index of where to store data
+     * @param string $data at least $this->data_size many bytes of data to
+     *     store
+     */
+    public function put($i, $data)
+    {
+        $data_size = $this->data_size;
+        $start = $i * $data_size;
+        charCopy($data, $this->string_array, $start, $data_size);
+    }
+}
--- a/src/library/SuffixTree.php
+++ b/src/library/SuffixTree.php
@ -0,0 +1,351 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Data structure used to maintain a suffix tree for a passage of words.
+ * The suffix tree is constructed using the linear time algorithm of
+ * Ukkonen, E. (1995). "On-line construction of suffix trees".
+ * Algorithmica 14 (3): 249–260.
+ *
+ * Nodes are stored in the array $this->tree and refer to each other by
+ * their integer index in that array. Edge labels are not stored directly;
+ * each node instead records a start and end position into $this->text.
+ *
+ * @author Chris Pollett
+ */
+class SuffixTree
+{
+    /**
+     * Index in $this->tree of the root node of the suffix tree
+     * @var int
+     */
+    public $root;
+    /**
+     * Index of last node added to the suffix tree in the array used to
+     * hold the suffix tree data structures
+     * @var int
+     */
+    public $last_added;
+    /**
+     * Position in the $this->text up to which we have created a suffix tree
+     * so far
+     * @var int
+     */
+    public $pos;
+    /**
+     * If in a given step in constructing the suffix tree we split the
+     * active edge and insert a new node and then have to do this
+     * again in the same step, then we need to create a sym_link between
+     * the suffix trees represented by these new nodes. This variable
+     * keeps track of the index of the first node so we can do this.
+     * It is reset to -1 at the start of each call to suffixTreeExtend().
+     *
+     * @var int
+     */
+    public $need_sym_link;
+
+    /**
+     * At a given stage in building the suffix tree how many new suffixes
+     * we need to insert
+     * @var int
+     */
+    public $remainder;
+    /**
+     * Index in $this->tree of the node at which the active edge starts.
+     * The active edge is the edge that contains the last suffix inserted
+     * @var int
+     */
+    public $active_index;
+    /**
+     * Index into $this->text of starting word of active edge
+     * @var int
+     */
+    public $active_edge_index;
+    /**
+     * How many words from the start of the active edge label to get the
+     * last suffix. If active edge label was: "a black cat a black" and
+     * $active_len was 2, then would have "a black" from the first two words.
+     * @var int
+     */
+    public $active_len;
+    /**
+     * Number of elements in $this->text. i.e., count($this->text)
+     * @var int
+     */
+    public $size;
+
+    /**
+     * The sequence of terms, one array entry per term, that a suffix tree is
+     * to be made from
+     * @var array
+     */
+    public $text;
+    /**
+     * Used to hold the suffix tree data structure (represented as a sequence
+     * of nodes)
+     * @var array
+     */
+    public $tree;
+    /**
+     * Upper bound on the length of any path in the tree. It is the default
+     * "end" value given to a node by makeNode(), and outputMaximal() treats
+     * a node whose "end" still has this value as a leaf.
+     */
+    const INFTY = 2000000000;
+    /**
+     * Initializes a suffix tree based on the supplied array of terms and
+     * immediately builds the tree for it.
+     *
+     * @param array $text a sequence of terms to build the suffix tree for
+     */
+    public function __construct($text)
+    {
+        $this->text = $text;
+        $this->size = count($text);
+        $this->buildTree();
+    }
+    /**
+     * Builds the complete suffix tree for the text currently stored in
+     * $this->text. If you change this text and call this method again,
+     * it builds a new tree based on the new text. Uses Ukkonen's algorithm:
+     * all construction state is reset, a root node is made, and then
+     * suffixTreeExtend() is called once per term of the text.
+     */
+    public function buildTree()
+    {
+        $this->tree = [];
+        $this->need_sym_link = 0;
+        $this->last_added = 0;
+        $this->pos = -1;
+        $this->remainder = 0;
+        $this->active_edge_index = 0;
+        $this->active_len = 0;
+        $this->root = $this->makeNode(-1, -1);
+        $this->active_index = $this->root;
+        $num_terms = count($this->text);
+        for ($i = 0; $i < $num_terms; $i++) {
+            $this->suffixTreeExtend();
+        }
+    }
+    /**
+     * Makes a new node for the suffix tree structure. This node
+     * is inserted at the end of the tree so far. A node is an associative
+     * array consisting of the fields "start" whose value
+     * is the starting location in $this->text for this node,
+     * "end" location in $this->text up to which this node is
+     * responsible, "sym_link" is a link to an isomorphic subtree for the
+     * purposes of building the suffix tree (initially 0), and "next" is an
+     * array of next children in the tree (initially empty), keyed by term
+     * with values being indexes of nodes in $this->tree.
+     *
+     * @param int $start what to use as the start value mentioned above
+     * @param int $end what to use as the end value mentioned above
+     * @return int the index in $this->tree of the newly made node
+     */
+    public function makeNode($start, $end = self::INFTY)
+    {
+        $node = [];
+        $node["start"] = $start;
+        $node["end"]  = $end;
+        $node["sym_link"] = 0;
+        $node["next"] = [];
+        $this->tree[++$this->last_added] = $node;
+        return $this->last_added;
+    }
+    /**
+     * The number of elements out of $this->text that this node is currently
+     * responsible for. The node's "end" is capped at $this->pos + 1, so a
+     * node whose "end" is still INFTY only counts terms read so far.
+     *
+     * @param array& $node the node to compute the length of
+     * @return int the capped end position minus the node's start position
+     */
+    public function edgeLength(&$node)
+    {
+        return min($node["end"], $this->pos + 1) - $node["start"];
+    }
+    /**
+     * If in a given step in constructing the suffix tree we split the
+     * active edge and insert a new node and then have to do this
+     * again in the same step, then we need to create a sym_link between
+     * the suffix trees represented by these new nodes. If in the current
+     * step it is necessary to add a sym_link this method sets the
+     * $this->need_sym_link node's "sym_link" field to $index which is supposed
+     * to be the index of the second created node. In all cases $index is
+     * then remembered in $this->need_sym_link for the next call.
+     *
+     * @param int $index the index of a created node in a given step.
+     *     ($this->need_sym_link will be greater than 0 if it is the second
+     *     created node of the step)
+     */
+    public function addSuffixLink($index)
+    {
+        if ($this->need_sym_link > 0) {
+            $this->tree[$this->need_sym_link]["sym_link"] = $index;
+        }
+        $this->need_sym_link = $index;
+    }
+    /**
+     * Used to set the active point to the node given by $index. This is
+     * only done when the active length is at least the edge length of that
+     * node; in that case the active edge index is advanced, and the active
+     * length reduced, by the edge length.
+     *
+     * @param int $index which node to use for setting
+     * @return bool true if the active point was moved to node $index;
+     *     false (with nothing updated) if the active length is shorter
+     *     than $index's edge length
+     */
+    public function walkDown($index)
+    {
+        $edge_length = $this->edgeLength($this->tree[$index]);
+        if ($this->active_len >= $edge_length) {
+            $this->active_edge_index += $edge_length;
+            $this->active_len -= $edge_length;
+            $this->active_index = $index;
+            return true;
+        }
+        return false;
+    }
+    /**
+     * Given a suffix tree of the array of terms in $this->text up to
+     * $this->pos, adds one to pos and builds the suffix tree up to this
+     * new value. i.e., the text with one more term added.
+     *
+     * Each call also adds one to $this->remainder; the loop below then
+     * inserts pending suffixes until either none remain or the new term is
+     * found to already be present along the active edge.
+     */
+    public function suffixTreeExtend()
+    {
+        $this->pos++;
+        $term = $this->text[$this->pos];
+        $this->need_sym_link = -1;
+        $this->remainder++;
+        if (!isset($this->text[$this->active_edge_index])) {
+            return;
+        }
+        while($this->remainder>0 && isset($this->text[$this->active_edge_index])
+            && isset($this->text[$this->pos]) ) {
+            if ($this->active_len == 0) {
+                $this->active_edge_index = $this->pos;
+            }
+            $active_term = $this->text[$this->active_edge_index];
+            if (!isset($this->tree[$this->active_index]["next"][$active_term])){
+                $leaf = $this->makeNode($this->pos);
+                $this->tree[$this->active_index]["next"][$active_term] = $leaf;
+                $this->addSuffixLink($this->active_index); //rule 2
+            } else {
+                $next = $this->tree[$this->active_index]["next"][$active_term];
+                if ($this->walkDown($next)) {
+                    continue; //observation 2: active point moved, retry
+                }
+                $start = $this->tree[$next]["start"];
+                if ($this->text[$start + $this->active_len] == $term) {
+                    //observation 1: term already on the edge, stop this step
+                    $this->active_len++;
+                    $this->addSuffixLink($this->active_index); //observation 3
+                    break;
+                }
+                $splitNode = $this->makeNode($start, $start+$this->active_len);
+                $active_term = $this->text[$this->active_edge_index];
+                $this->tree[$this->active_index]["next"][$active_term] =
+                    $splitNode;
+                $leaf = $this->makeNode($this->pos);
+                $this->tree[$splitNode]["next"][$term] = $leaf;
+                $this->tree[$next]["start"] += $this->active_len;
+                $this->tree[$splitNode]["next"][
+                    $this->text[$this->tree[$next]["start"]]] = $next;
+                $this->addSuffixLink($splitNode); //rule 2
+            }
+            $this->remainder--;
+            if ($this->active_index == $this->root && $this->active_len > 0) {
+                //rule 1: at the root, shorten the active length by one
+                $this->active_len--;
+                $this->active_edge_index = $this->pos - $this->remainder + 1;
+            } else {
+                $this->active_index =
+                    ($this->tree[$this->active_index]["sym_link"] > 0 ) ?
+                    $this->tree[$this->active_index]["sym_link"] : $this->root;
+                    //rule 3: follow the sym_link if set, else use the root
+            }
+        }
+    }
+    /**
+     * Recursive function used to compute the maximal phrases in a document
+     * as well as their conditional maximal subphrases.
+     *
+     * Phrases longer than C\MAX_QUERY_TERMS terms are recorded under a key
+     * made from only their first C\MAX_QUERY_TERMS terms. A node whose
+     * "end" is INFTY is treated as a leaf: its phrase is recorded and the
+     * recursion stops there.
+     *
+     * @param int $index a node in the suffix tree
+     * @param string $path space separated terms on the path from the root
+     *     to the current node
+     * @param int $len number of terms in $path
+     * @param array& $maximal assoc array of phrase => (cond_max => pos of
+     *     conditional maximal subphrase, [0] => pos_1st_occurrence of phrase,
+     *     [1]=>pos_2nd_occurrence of phrase, etc)
+     */
+    public function outputMaximal($index, $path, $len, &$maximal)
+    {
+        $start = $this->tree[$index]["start"];
+        $end = $this->tree[$index]["end"];
+        if ($start >= 0 && $end >= 0) {
+            $tmp_terms = array_slice($this->text, $start, $end - $start);
+            $tmp = implode(" ", $tmp_terms);
+            $num = count($tmp_terms);
+            if ($path != "") {
+                $begin = $start - $len;
+                $out_path = $path;
+                if ($len > C\MAX_QUERY_TERMS) {
+                    $out_path = implode(" ", array_slice($this->text, $begin,
+                        C\MAX_QUERY_TERMS));
+                }
+                $maximal[$out_path][] = $begin;
+                if (!isset($maximal[$out_path]["cond_max"])) {
+                    $maximal[$out_path]["cond_max"] =
+                        strpos($out_path, " ") + 1;
+                }
+                if ($len > 1 && $len < C\MAX_QUERY_TERMS) {
+                    $cond_max = strlen($path) + 1;
+                }
+                $path .= " ".$tmp;
+                $len += $num;
+                if (isset($cond_max)) {
+                    $out_path = $path;
+                    if ($len > C\MAX_QUERY_TERMS) {
+                        $out_path = implode(" ", array_slice($this->text,
+                            $begin, C\MAX_QUERY_TERMS));
+                    }
+                    $maximal[$out_path]["cond_max"] = $cond_max;
+                }
+            } else {
+                $len = $num;
+                $path = $tmp;
+            }
+        }
+        if ($end == self::INFTY) {
+            $begin = $this->size - $len;
+            $out_path = $path;
+            if ($len > C\MAX_QUERY_TERMS) {
+                $out_path = implode(" ", array_slice($this->text, $begin,
+                    C\MAX_QUERY_TERMS));
+            }
+            $maximal[$out_path][] = $begin;
+            if (!isset($maximal[$out_path]["cond_max"])) {
+                $maximal[$out_path]["cond_max"] =
+                    strpos($out_path, " ") + 1;
+            }
+            return;
+        }
+        foreach ($this->tree[$index]["next"] as $sub_index) {
+            $this->outputMaximal($sub_index, $path, $len, $maximal);
+        }
+    }
+}
--- a/src/library/Thesaurus.php
+++ b/src/library/Thesaurus.php
@ -0,0 +1,361 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Shailesh Padave shaileshpadave49@gmail.com
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Class used to reorder the last 10 links computed by PhraseModel based on
+ * thesaurus semantic information. For English, thesaurus semantic information
+ * can be provided by WordNet, a lexical English database
+ * available at http://wordnet.princeton.edu/
+ * To enable this, you have to define WORDNET_EXEC in your local_config file.
+ * The idea behind thesaurus reordering is that a given query
+ * is tagged for parts of speech. Each term is then looked up in the thesaurus
+ * for those parts of speech. Representative phrases for those term senses are
+ * extracted from the ranked thesaurus output and a set of rewrites of the
+ * original query are created. By looking up the number
+ * of times these rewrites occur in the searched index the top two phrases
+ * that represent the original query are computed. The BM25 similarity of these
+ * phrases is then scored against each of the 10 output summaries of
+ * PhraseModel and used to reorder the results.
+ * To add thesaurus reordering for a different locale, two methods need to be
+ * written in that locale tokenizer.php file
+ * tagPartsOfSpeechPhrase($phrase) which on an input phrase returns a string
+ *     where each term_i in the phrase has been replaced with term_i~pos
+ *     where pos is a two character part of speech NN, VB, AJ, AV, or NA (if
+ *     none of the previous apply)
+ * scoredThesaurusMatches($term, $word_type, $whole_query) which takes
+ *     a term from an original whole_query which has been tagged to be
+ *     one of the types VB (for verb), NN (for noun), AJ (for adjective),
+ *     AV (for adverb), or NA (for anything else), it outputs
+ *     a sequence of  (score => array of thesaurus terms) associations.
+ *     The score representing one word sense of term
+ * Given that these methods have been implemented if the use_thesaurus field
+ * of that language tokenizer is set to true, the thesaurus will be used.
+ */
+class Thesaurus
+{
+    /**
+     * Extracts similar phrases to the input query using thesaurus results.
+     * Part of speech tagging is carried out on the input and the output is
+     * looked up in the thesaurus. Using this a list of alternate
+     * query phrases is created. For those phrases, counts in the Yioop
+     * index are calculated and the top two phrases are selected.
+     *
+     * @param string $orig_query input query from user
+     * @param string $index_name selected index for search engine
+     * @param string $lang locale tag for the query
+     * @param int $threshold cut-off handed to numDocsIndex when counting
+     *     the documents a suggested phrase occurs in
+     * @return array of at most two suggested phrases, highest count first
+     */
+    public static function getSimilarPhrases($orig_query, $index_name,
+        $lang, $threshold = 10)
+    {
+        $num_docs = [];
+        $suggested_queries =
+            self::getInitialSuggestions($orig_query, $lang);
+        foreach ($suggested_queries as $suggestion) {
+            $num_docs[$suggestion] =
+                self::numDocsIndex($suggestion, $threshold, $index_name, $lang);
+        }
+        // sort by document count, largest first, keeping phrase keys
+        arsort($num_docs);
+        return array_slice(array_keys($num_docs), 0, 2);
+    }
+    /**
+     * Gets array of BM25 scores for given input array of summaries
+     * and thesaurus generated queries. Only the first two phrases are
+     * used: a single phrase gives the score directly, two phrases are
+     * combined with weights 2/3 and 1/3.
+     *
+     * @param array $similar_phrases an array of thesaurus generated queries
+     * @param array $summaries an array of summary strings
+     * @return array of BM25 scores, one per summary (in summary order);
+     *     empty if there are no phrases or no summaries
+     */
+    public static function scorePhrasesSummaries($similar_phrases, $summaries)
+    {
+        if (empty($similar_phrases)) {
+            return [];
+        }
+        // later phrases never contributed to the score, so skip them
+        $phrases = array_slice(array_values($similar_phrases), 0, 2);
+        // lower casing does not depend on the phrase, so do it only once
+        $summaries = self::changeCaseOfStringArray($summaries);
+        $num_summaries = count($summaries);
+        $bm25_result = [];
+        foreach ($phrases as $phrase) {
+            $terms = explode(' ', $phrase);
+            $idf = self::calculateIDF($summaries, $terms);
+            $tf = self::calculateTFBM25($summaries, $terms);
+            $bm25_result[] = self::calculateBM25($idf, $tf, count($terms),
+                $num_summaries);
+        }
+        $score = [];
+        $single_phrase = (count($bm25_result) == 1);
+        for ($i = 0; $i < $num_summaries; $i++) {
+            if ($single_phrase) {
+                $score[$i] = $bm25_result[0][$i];
+            } else {
+                $score[$i] = $bm25_result[0][$i] * (2/3) +
+                    $bm25_result[1][$i] * (1/3);
+            }
+        }
+        return $score;
+    }
+    /**
+     * Computes suggested related phrases from thesaurus based on part of
+     * speech tagging done on each query term.
+     *
+     * @param string $query query entered by user
+     * @param string $lang locale tag for the query
+     * @return array of rewrites of $query in which a tagged term has been
+     *     replaced by a phrase from its highest scoring thesaurus sense.
+     *     Only rewrites shorter than 25 characters that do not contain
+     *     the original query are kept
+     */
+    public static function getInitialSuggestions($query, $lang)
+    {
+        $tokenizer = PhraseParser::getTokenizer($lang);
+        $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query);
+        $max_len = 25;
+        $replacement_phrases = [];
+        $suggestions = [];
+        $pos_terms = preg_split("/\s+/",
+            trim($pos_query), -1, PREG_SPLIT_NO_EMPTY);
+        $known_word_types = ["NN", "VB", "AJ", "AV"];
+        foreach ($pos_terms as $pos_term) {
+            $pos = strpos($pos_term, '~');
+            if ($pos === false) {
+                // token carries no term~pos tag so nothing to look up
+                continue;
+            }
+            $word_type = trim(substr($pos_term, $pos + 1));
+            if (!in_array($word_type, $known_word_types, true)) {
+                // anything else counts as "NA" and is not looked up
+                continue;
+            }
+            $current_word = substr($pos_term, 0, $pos);
+            $similar_phrases = $tokenizer->scoredThesaurusMatches(
+                $current_word, $word_type, $query);
+            // only the first (best scoring) sense is used
+            $highest_scoring_sense_phrases = ($similar_phrases) ?
+                array_shift($similar_phrases): false;
+            if ($highest_scoring_sense_phrases) {
+                $replacement_phrases[$current_word] =
+                    $highest_scoring_sense_phrases;
+            }
+        }
+        foreach ($replacement_phrases as $word => $similar_phrases) {
+            // purely numeric array keys come back from PHP as integers
+            $word = (string)$word;
+            if ($word === "") {
+                continue;
+            }
+            foreach ($similar_phrases as $phrase) {
+                if (mb_strpos(trim($phrase), ' ') !== false) {
+                    $phrase = preg_replace('/~[\w]+/', '', $phrase);
+                }
+                /* literal replacement: the term is user input and must
+                   not be interpreted as a regular expression */
+                $modified_query = str_replace($word, trim($phrase), $query);
+                if (mb_strlen($modified_query) < $max_len &&
+                    mb_strpos($modified_query, $query) === false) {
+                    $suggestions[] = $modified_query;
+                }
+            }
+        }
+        return $suggestions;
+    }
+    /**
+     * Returns the number of documents in an index that a phrase occurs in.
+     * The threshold is passed on to IndexManager::numDocsTerm.
+     *
+     * @param string $phrase to look up in index
+     * @param int $threshold cut-off passed to IndexManager::numDocsTerm
+     * @param string $index_name selected index for search engine
+     * @param string $lang locale tag for the query
+     * @return int number of documents phrase occurs in; 0 if no index
+     *     name was given
+     */
+    public static function numDocsIndex($phrase, $threshold, $index_name, $lang)
+    {
+        PhraseParser::canonicalizePunctuatedTerms($phrase, $lang);
+        $terms = PhraseParser::stemCharGramSegment($phrase, $lang);
+        if ($index_name == null) {
+            return 0;
+        }
+        if (count($terms) > C\MAX_QUERY_TERMS) {
+            $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS);
+        }
+        $whole_phrase = implode(" ", $terms);
+        return IndexManager::numDocsTerm($whole_phrase, $index_name,
+            $threshold);
+    }
+    /**
+     * Lower cases an array of strings. The result is re-indexed 0, 1, 2...
+     * and has exactly one entry per input entry (so an empty input gives
+     * an empty output).
+     *
+     * @param array $summaries strings to put into lower case
+     * @return array with strings converted to lower case
+     */
+    public static function changeCaseOfStringArray($summaries)
+    {
+        $lowered = [];
+        foreach ($summaries as $summary) {
+            $lowered[] = mb_strtolower((string)$summary);
+        }
+        return $lowered;
+    }
+    /**
+     * Computes the BM25 of an array of documents given that the idf and
+     * tf scores for these documents have already been computed. The score
+     * of a document is the sum over the terms of idf(term) * tf(term, doc).
+     *
+     * @param array $idf inverse doc frequency for given query array
+     * @param array $tf term frequency for given query array, indexed
+     *     first by term then by summary
+     * @param int $num_terms number of terms that make up input query
+     * @param int $num_summaries count for input summaries
+     * @return array consisting of BM25 scores for each document
+     */
+    public static function calculateBM25($idf, $tf, $num_terms, $num_summaries)
+    {
+        $scores = [];
+        for ($j = 0; $j < $num_summaries; $j++) {
+            $total = 0;
+            for ($i = 0; $i < $num_terms; $i++) {
+                $total += $idf[$i] * $tf[$i][$j];
+            }
+            $scores[$j] = $total;
+        }
+        return $scores;
+    }
+    /**
+     * Calculates the BM25 normalized term frequency of a set of terms in
+     * a collection of text summaries. Each frequency is normalized by the
+     * length (in bytes) of the summary it was counted in relative to the
+     * average summary length.
+     *
+     * @param array $summaries list of summary strings to compute BM25TF w.r.t
+     * @param array $terms we want the term frequency computation for
+     * @return array $tfbm25 a 2d array with rows being indexed by terms and
+     *     columns indexed by summaries and the values of an entry being
+     *     the tfbm25 score for that term in that document
+     */
+    public static function calculateTFBM25($summaries, $terms)
+    {
+        $k1 = 1.5;
+        $b = 0.75;
+        $tfbm25 = [];
+        $num_summaries = count($summaries);
+        $lengths = [];
+        for ($j = 0; $j < $num_summaries; $j++) {
+            $lengths[$j] = strlen($summaries[$j]);
+        }
+        $avg_length = ($num_summaries != 0) ?
+            array_sum($lengths) / $num_summaries : 0;
+        // guards the division below when all summaries are empty
+        $avg_length = max($avg_length, 1);
+        $tf_values = self::calculateTermFreq($summaries, $terms);
+        $num_terms = count($terms);
+        for ($i = 0; $i < $num_terms; $i++) {
+            for ($j = 0; $j < $num_summaries; $j++) {
+                $frequency = $tf_values[$i][$j];
+                // length of this summary, not of the whole collection
+                $tfbm25[$i][$j] =
+                    ($frequency * ($k1 + 1))/($frequency + $k1 *
+                    ((1 - $b) + $b * ($lengths[$j]/$avg_length)));
+            }
+        }
+        return $tfbm25;
+    }
+    /**
+     * Computes a 2D array of the number of occurrences of term i in
+     * document j. Matching is case sensitive; empty terms count as 0.
+     *
+     * @param array $summaries documents to compute frequencies in
+     * @param array $terms terms to compute frequencies for
+     * @return array 2D array as described above
+     */
+    public static function calculateTermFreq($summaries, $terms)
+    {
+        $tf_values = [];
+        $num_terms = count($terms);
+        $num_summaries = count($summaries);
+        for ($i = 0; $i < $num_terms; $i++) {
+            for ($j = 0; $j < $num_summaries; $j++) {
+                $tf_values[$i][$j] = ($terms[$i] != "") ?
+                    substr_count($summaries[$j], $terms[$i]) : 0;
+            }
+        }
+        return $tf_values;
+    }
+    /**
+     * To get the inverse document frequencies for a collection of terms in
+     * a set of documents.
+     * IDF(term_i) = log_10(# of document / # docs term i in)
+     * A term that is empty or occurs in no document gets an IDF of 0.
+     *
+     * @param array $summaries documents to use in calculating IDF score
+     * @param array $terms terms to compute IDF score for
+     * @return array $idf 1D-array saying the inverse document frequency for
+     *     each term
+     */
+    public static function calculateIDF($summaries, $terms)
+    {
+        $idf = [];
+        $num_docs = count($summaries);
+        $num_terms = count($terms);
+        for ($i = 0; $i < $num_terms; $i++) {
+            $docs_with_term = 0;
+            if ($terms[$i] != "") {
+                foreach ($summaries as $summary) {
+                    if (stripos($summary, $terms[$i]) !== false) {
+                        $docs_with_term++;
+                    }
+                }
+            }
+            $idf[$i] = ($docs_with_term != 0) ?
+                log10($num_docs / $docs_with_term) : 0;
+        }
+        return $idf;
+    }
+}
--- a/src/library/Trie.php
+++ b/src/library/Trie.php
@ -0,0 +1,180 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Sandhya Vissapragada, Chris Pollett (separated out this
+ *     code into a separate file and cleaned up)
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Implements a trie data structure which can be used to store terms read
+ * from a dictionary in a succinct way
+ *
+ * @author Sandhya Vissapragada, Chris Pollett (rewrite +
+ *     documentation, multi-byte support)
+ */
+class Trie
+{
+    /**
+     * A nested array used to represent the trie. Keys are rawurlencode'd
+     * characters; the end marker key holds the value stored with a term
+     * @var array
+     */
+    public $trie_array;
+    /**
+     * The marker used to represent the end of an entry in a trie
+     * @var string
+     */
+    public $end_marker;
+    /**
+     * Creates an empty trie. Sets the end of term character
+     *
+     * @param string $end_marker end of term marker
+     */
+    public function __construct($end_marker = " ")
+    {
+        $this->trie_array = [];
+        $this->end_marker = $end_marker;
+    }
+    /**
+     * Adds a term to the Trie. Only the part of $term before the first
+     * space is inserted character by character (line feeds are skipped);
+     * the part between the first and second space, if any, is stored as
+     * the value under the end marker, otherwise null is stored.
+     *
+     * @param string $term the term to be inserted
+     * @return array $trie_array beneath last letter of term inserted
+     */
+    public function add($term)
+    {
+        // reference so that new nodes are written into $this->trie_array
+        $node = & $this->trie_array;
+        $term_arr = explode(" ", $term);
+        if (!isset($term_arr[1])) {
+            $term_arr[1] = null;
+        }
+        $len = mb_strlen($term_arr[0], "utf-8");
+        for ($i = 0; $i < $len; $i++) {
+            $character = mb_substr($term_arr[0], $i, 1, "utf-8");
+            $enc_char = rawurlencode($character);
+            // To avoid encoding the linefeed
+            if ($enc_char == "%0A") {
+                continue;
+            }
+            // If letter doesn't exist then create one by
+            // assigning new array
+            if (!isset($node[$enc_char])) {
+                $node[$enc_char] = [];
+            }
+            $node = & $node[$enc_char];
+        }
+        // Set end of term marker
+        $node[$this->end_marker] = $term_arr[1];
+        return $node;
+    }
+    /**
+     * Returns the sub trie_array under $term in
+     * $this->trie_array. If $term does not exist in $this->trie_array
+     * returns false
+     *
+     * @param string $term term to look up
+     * @return mixed subtrie array under term or false if $term is not a
+     *     prefix of any stored term
+     */
+    public function exists($term)
+    {
+        $node = $this->trie_array;
+        $len = mb_strlen($term, "utf-8");
+        for ($i = 0; $i < $len; $i++) {
+            // ran out of trie (empty or not a node) before end of term
+            if (!is_array($node) || $node == null) {
+                return false;
+            }
+            $character = mb_substr($term, $i, 1, "utf-8");
+            $enc_char = rawurlencode($character);
+            if (!isset($node[$enc_char])) {
+                return false;
+            }
+            /* NOTE(review): nodes written by add() are arrays, so this
+               test looks like it only matters for a trie_array that was
+               populated from elsewhere; kept to preserve behavior */
+            if ($node[$enc_char] != $this->end_marker) {
+                $node = $node[$enc_char];
+            }
+        }
+        return $node;
+    }
+    /**
+     * Returns all the terms in the trie beneath the provided term prefix
+     *
+     * @param string $prefix of term to look up
+     * @param int $max_results maximum number of strings to return
+     * @return mixed array of terms under $prefix, or false if nothing
+     *     is stored under $prefix
+     */
+    public function getValues($prefix, $max_results)
+    {
+        $trie_array = $this->exists($prefix);
+        if (!$trie_array) {
+            return false;
+        }
+        return $this->getValuesTrieArray($trie_array, $prefix, $max_results);
+    }
+    /**
+     * Collects terms stored in $trie_array, each with $prefix prepended,
+     * until $max_results terms have been found in total. $count and
+     * $find_more are shared by reference between the recursive calls so
+     * that the limit applies to the whole traversal.
+     *
+     * @param array $trie_array a nested array representing a trie to look
+     *     up suffixes in
+     * @param string $prefix to prepend to each found suffix
+     * @param int $max_results maximum number of strings to return
+     * @param int $count number of terms found so far
+     * @param bool $find_more whether to keep looking (stops recursion)
+     * @return array $terms a list of ($prefix.suffix1, $prefix.suffix2,...)
+     */
+    private function getValuesTrieArray($trie_array, $prefix, $max_results,
+        &$count = 0, &$find_more = true)
+    {
+        $terms = [];
+        if (!is_array($trie_array) || $trie_array == null || !$find_more) {
+            return $terms;
+        }
+        $end_marker = (string)$this->end_marker;
+        foreach ($trie_array as $character => $subtrie) {
+            if (!$find_more) {
+                break;
+            }
+            /* PHP turns digit-only keys such as "0" into integers and a
+               loose 0 != " " is false before PHP 8, so compare strings
+               strictly */
+            $character = (string)$character;
+            if ($character !== $end_marker) {
+                $new_terms =
+                    $this->getValuesTrieArray($subtrie,
+                        $prefix . urldecode($character),
+                        $max_results, $count, $find_more);
+                $terms = array_merge($terms, $new_terms);
+            } else if ($count < $max_results) {
+                $terms[] = $prefix;
+                $count++;
+                if ($count >= $max_results) {
+                    $find_more = false;
+                }
+            } else {
+                $find_more = false;
+            }
+        }
+        return $terms;
+    }
+}
--- a/src/library/UnitTest.php
+++ b/src/library/UnitTest.php
@ -0,0 +1,169 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+/**
+ * Base class for all the SeekQuarry/Yioop engine Unit tests
+ *
+ * @author Chris Pollett
+ */
+abstract class UnitTest
+{
+    /**
+     * Used to store the results for each test sub case
+     * @var array
+     */
+    public $test_case_results;
+    /**
+     * Used to hold objects to be used in tests
+     * @var array
+     */
+    public $test_objects;
+    /**
+     * The suffix that all TestCase methods need to have to be called by run()
+     */
+    const case_name = "TestCase";
+    /**
+     * Constructor should be overridden to do any set up that occurs before
+     * any test cases
+     */
+    public function __construct()
+    {
+    }
+    /**
+     * Execute each of the test cases of this unit test and return the
+     * results. A test case is any method whose name ends in
+     * self::case_name; setUp() is called before and tearDown() after
+     * each such method (and only for such methods).
+     *
+     * @return array test case results, keyed by test case method name
+     */
+    public function run()
+    {
+        $test_results = [];
+        $suffix_len = strlen(self::case_name);
+        foreach (get_class_methods(get_class($this)) as $method) {
+            // helper methods (asserts, setUp, ...) are not test cases
+            if (strlen($method) < $suffix_len || substr_compare(
+                $method, self::case_name, -$suffix_len) !== 0) {
+                continue;
+            }
+            $this->test_objects = null;
+            $this->setUp();
+            $this->test_case_results = [];
+            $this->$method();
+            $test_results[$method] = $this->test_case_results;
+            $this->tearDown();
+        }
+        return $test_results;
+    }
+    /**
+     * Checks that $x can coerced to true, the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x item to check
+     * @param string $description information about this test subcase
+     */
+    public function assertTrue($x, $description = "")
+    {
+        $this->recordResult("assertTrue", (bool)$x, $description);
+    }
+    /**
+     * Checks that $x can coerced to false, the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x item to check
+     * @param string $description information about this test subcase
+     */
+    public function assertFalse($x, $description = "")
+    {
+        $this->recordResult("assertFalse", !$x, $description);
+    }
+    /**
+     * Checks that $x and $y are the same (using ==), the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x a first item to compare
+     * @param mixed $y a second item to compare
+     * @param string $description information about this test subcase
+     */
+    public function assertEqual($x, $y, $description = "")
+    {
+        $this->recordResult("assertEqual", $x == $y, $description);
+    }
+    /**
+     * Checks that $x and $y are not the same (using !=), the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x a first item to compare
+     * @param mixed $y a second item to compare
+     * @param string $description information about this test subcase
+     */
+    public function assertNotEqual($x, $y, $description = "")
+    {
+        $this->recordResult("assertNotEqual", $x != $y, $description);
+    }
+    /**
+     * Appends the outcome of one assertion to $this->test_case_results.
+     * The sub case number in the name is the number of results recorded
+     * before this one.
+     *
+     * @param string $assertion name of the assert method that was called
+     * @param bool $passed whether the assertion held
+     * @param string $description information about this test subcase
+     */
+    private function recordResult($assertion, $passed, $description)
+    {
+        // an assert may be called before run() initialised the list
+        $sub_case_num = is_array($this->test_case_results) ?
+            count($this->test_case_results) : 0;
+        $test = [];
+        $test['NAME'] = "Case Test $sub_case_num $assertion $description";
+        $test['PASS'] = $passed;
+        $this->test_case_results[] = $test;
+    }
+    /**
+     * This method is called before each test case is run to set up the
+     * given test case
+     */
+    abstract function setUp();
+    /**
+     * This method is called after each test case is run to clean up
+     */
+    abstract function tearDown();
+}
--- a/src/library/UpgradeFunctions.php
+++ b/src/library/UpgradeFunctions.php
@ -0,0 +1,294 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * This file contains global functions to check whether
+ * upgrading the database or locales is needed as well as auxiliary functions
+ * to be used by the VersionFunctions.php code to actually carry out
+ * upgrades between versions
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\models as M;
+use seekquarry\yioop\models\datasources as D;
+
+/** For Yioop global defines */
+require_once __DIR__."/../configs/Config.php";
+/**
+ * Checks to see if the locale data of Yioop! of a locale in the work dir is
+ * older than the currently running Yioop!
+ *
+ * The check compares the modification time of the locale's configure.ini
+ * under C\FALLBACK_LOCALE_DIR with that of the copy under C\LOCALE_DIR.
+ *
+ * @param string $locale_tag locale to check directory of
+ * @return mixed the string "locale" if the fallback configure.ini is newer
+ *     than the one in C\LOCALE_DIR (or the latter does not exist), false
+ *     if it is not newer or does not exist, and null if C\PROFILE is not
+ *     set
+ */
+function upgradeLocalesCheck($locale_tag)
+{
+    if (!C\PROFILE) {
+        return;
+    }
+    // locale folders use underscores where locale tags use hyphens
+    $dir_locale_tag = str_replace("-", "_", $locale_tag);
+    $config_name = C\LOCALE_DIR."/$dir_locale_tag/configure.ini";
+    $fallback_config_name =
+        C\FALLBACK_LOCALE_DIR."/$dir_locale_tag/configure.ini";
+    /*
+        filemtime() raises a warning and returns false for a missing file,
+        so existence is tested explicitly instead of comparing with false
+     */
+    if (!file_exists($fallback_config_name)) {
+        return false;
+    }
+    if (!file_exists($config_name) ||
+        filemtime($fallback_config_name) > filemtime($config_name)) {
+        return "locale";
+    }
+    return false;
+}
+/**
+ * If the locale data of Yioop! in the work directory is older than the
+ * currently running Yioop! then this function is called to at least
+ * try to copy the new strings into the old profile.
+ *
+ * Does nothing when C\PROFILE is not set.
+ */
+function upgradeLocales()
+{
+    if (!C\PROFILE) {
+        return;
+    }
+    $locale_model = new M\LocaleModel();
+    $locale_model->initialize(C\DEFAULT_LOCALE);
+    $forced_folders = [];
+    /*
+        A profile whose strings lack view_locale_version15 is treated as
+        out of date: its resources folder is force-pushed and the default
+        Public and Help wiki pages are re-inserted
+    */
+    $is_outdated = empty(
+        $locale_model->configure['strings']["view_locale_version15"]);
+    if ($is_outdated) {
+        $forced_folders = ["resources"];
+        upgradePublicHelpWiki($locale_model->db);
+    }
+    $locale_model->extractMergeLocales($forced_folders);
+}
+/**
+ * Used to force push the default Public and Help wiki pages into the current
+ * database. The locale in effect when the function is called is restored
+ * before it returns.
+ *
+ * @param object& $db datasource to use to upgrade
+ */
+function upgradePublicHelpWiki(&$db)
+{
+    /** For new wiki pages ($public_pages and $help_pages) */
+    require_once C\BASE_DIR."/configs/PublicHelpPages.php";
+    $group_model = new M\GroupModel(C\DB_NAME, false);
+    $group_model->db = $db;
+    $original_locale = getLocaleTag();
+    // the default public pages are inserted first, then the help pages
+    $page_sets = [
+        [C\PUBLIC_USER_ID, $public_pages],
+        [C\HELP_GROUP_ID, $help_pages],
+    ];
+    foreach ($page_sets as $page_set) {
+        list($group_id, $pages_by_locale) = $page_set;
+        foreach ($pages_by_locale as $locale_tag => $locale_pages) {
+            // switch locale so the tl() strings come out in that locale
+            setLocaleObject($locale_tag);
+            foreach ($locale_pages as $page_name => $page_content) {
+                $group_model->setPageName(C\ROOT_ID, $group_id, $page_name,
+                    $page_content, $locale_tag, "",
+                    tl('social_component_page_created', $page_name),
+                    tl('social_component_page_discuss_here'));
+            }
+        }
+    }
+    setLocaleObject($original_locale);
+}
+/**
+ * Checks to see if the database data or work_dir folder of Yioop! is from an
+ * older version of Yioop! than the currently running Yioop!
+ *
+ * The VERSION table is queried up to three times, sleeping three seconds
+ * after each failed query. If every attempt fails the script exits.
+ *
+ * @return bool false if the stored version is at least C\YIOOP_VERSION,
+ *     true otherwise
+ */
+function upgradeDatabaseWorkDirectoryCheck()
+{
+    $model = new M\Model();
+    $version_query = "SELECT ID FROM VERSION";
+    $attempts_left = 3;
+    while ($attempts_left-- > 0) {
+        $result = @$model->db->execute($version_query);
+        if ($result === false) {
+            sleep(3);
+            continue;
+        }
+        $row = $model->db->fetchArray($result);
+        // the column name may come back in either upper or lower case
+        $is_current =
+            (isset($row['ID']) && $row['ID'] >= C\YIOOP_VERSION) ||
+            (isset($row['id']) && $row['id'] >= C\YIOOP_VERSION);
+        return !$is_current;
+    }
+    exit();
+}
+/**
+ * If the database data of Yioop is older than the version of the
+ * currently running Yioop then this function is called to try
+ * upgrade the database to the new version
+ *
+ * For each version after the stored one, up to C\YIOOP_VERSION, the
+ * function upgradeDatabaseVersion<number> in the C\NS_LIB namespace is
+ * called if it exists. Afterwards the stored version is set to
+ * C\YIOOP_VERSION. The script exits if the VERSION table cannot be read.
+ */
+function upgradeDatabaseWorkDirectory()
+{
+    $model = new M\Model();
+    $result = @$model->db->execute("SELECT ID FROM VERSION");
+    if ($result === false) {
+        exit(); // maybe someone else has locked DB, so bail
+    }
+    $row = $model->db->fetchArray($result);
+    // with no usable version row, upgrade as if from version 1
+    $stored_version = 1;
+    if (!empty($row['ID'])) {
+        $stored_version = min($row['ID'], C\YIOOP_VERSION);
+    } else if (!empty($row['id'])) {
+        $stored_version = min($row['id'], C\YIOOP_VERSION);
+    }
+    $result = null; //don't lock db if sqlite
+    $all_versions = range(1, C\YIOOP_VERSION);
+    $position = array_search($stored_version, $all_versions);
+    $pending_versions = array_slice($all_versions, $position + 1);
+    foreach ($pending_versions as $version) {
+        $upgrade_function = C\NS_LIB . "upgradeDatabaseVersion$version";
+        if (function_exists($upgrade_function)) {
+            $upgrade_function($model->db);
+        }
+    }
+    updateVersionNumber($model->db, C\YIOOP_VERSION);
+}
+
+/**
+ * Update the database version number to a new number
+ *
+ * All rows of the VERSION table are deleted and a single row holding
+ * $number is inserted.
+ *
+ * @param object& $db datasource for Yioop database
+ * @param int $number the new database number
+ */
+function updateVersionNumber(&$db, $number)
+{
+    $db->execute("DELETE FROM VERSION");
+    /* bind the number as a parameter, as the other queries in this file
+       do, rather than interpolating it into the SQL text
+     */
+    $db->execute("INSERT INTO VERSION VALUES (?)", [$number]);
+}
+/**
+ * Reads the Help articles from default db and returns the array of pages.
+ *
+ * The pages are read for the en-US locale from the sqlite database
+ * C\BASE_DIR/data/default.db (at most 200 of them).
+ *
+ * @return mixed false if connecting to the default database reported
+ *     failure; otherwise an array of page title => page content in which
+ *     HTML entities in the content have been decoded
+ */
+function getWikiHelpPages()
+{
+    $help_pages = [];
+    $default_dbm = new D\Sqlite3Manager();
+    /* $default_dbm is an object and so always truthy; whether the
+       connection worked has to be read off connect()'s result. Only a
+       strict false counts as failure so a connect() that returns nothing
+       behaves as before
+     */
+    $connection = $default_dbm->connect("", "", "",
+        C\BASE_DIR . "/data/default.db");
+    if ($connection === false) {
+        return false;
+    }
+    $group_model = new M\GroupModel(C\DB_NAME, true);
+    $group_model->db = $default_dbm;
+    $page_list = $group_model->getPageList(
+        C\HELP_GROUP_ID, "en-US", '', 0, 200);
+    // index 1 of the result holds the rows describing the pages
+    if (empty($page_list[1]) || !is_array($page_list[1])) {
+        return $help_pages;
+    }
+    foreach ($page_list[1] as $page) {
+        if (!isset($page['TITLE'])) {
+            continue;
+        }
+        $page_info = $group_model->getPageInfoByName(
+            C\HELP_GROUP_ID, $page['TITLE'], "en-US", "api");
+        $raw_content = isset($page_info['PAGE']) ? $page_info['PAGE'] : "";
+        $page_content = str_replace("&amp;", "&", $raw_content);
+        $page_content = html_entity_decode($page_content, ENT_QUOTES,
+            "UTF-8");
+        $help_pages[$page['TITLE']] = $page_content;
+    }
+    return $help_pages;
+}
+/**
+ * Used to insert a new activity into the database at a given activity_id
+ *
+ * Inserting at an ID rather than at the end is useful since activities are
+ * displayed in admin panel in order of increasing id.
+ *
+ * Besides the ACTIVITY row, a TRANSLATION row mapping a newly chosen
+ * translation id to $string_id is inserted and, except for the
+ * manageAdvertisements and manageCredits methods, role id 1 is given
+ * the activity in ROLE_ACTIVITY. If the TRANSLATION table cannot be
+ * counted, an error message is echoed and the script exits.
+ *
+ * @param resource& $db database handle where Yioop database stored
+ * @param string $string_id message identifier to give translations for
+ *     for activity
+ * @param string  $method_name admin_controller method to be called to perform
+ *      this activity
+ * @param int $activity_id the id location at which to create this activity
+ *     activity at and below this location will be shifted down by 1.
+ */
+function addActivityAtId(&$db, $string_id, $method_name, $activity_id)
+{
+    // make room: bump the id of every activity at or after $activity_id
+    $db->execute("UPDATE ACTIVITY SET ACTIVITY_ID = ACTIVITY_ID + 1 WHERE ".
+        "ACTIVITY_ID >= ?", [$activity_id]);
+    /*
+        Row by row shift, highest id first so that a re-inserted row does
+        not land on an id that is still in use.
+        NOTE(review): the UPDATE above and this loop each move rows at or
+        after $activity_id up by one; presumably the loop is a fallback for
+        databases where the bulk UPDATE cannot be applied -- confirm, since
+        if both succeed the rows appear to be shifted twice
+     */
+    $sql = "SELECT * FROM ACTIVITY WHERE ACTIVITY_ID >= ?
+        ORDER BY ACTIVITY_ID DESC";
+    $result = $db->execute($sql, [$activity_id]);
+    while ($row = $db->fetchArray($result)) {
+        $db->execute("INSERT INTO ACTIVITY VALUES (?, ?, ?)",
+            [($row['ACTIVITY_ID'] + 1), $row['TRANSLATION_ID'],
+            $row['METHOD_NAME']]);
+        $db->execute("DELETE FROM ACTIVITY WHERE ACTIVITY_ID = ?",
+            [$row['ACTIVITY_ID']]);
+    }
+    // these two methods get no ROLE_ACTIVITY shift and no root permission
+    if (!in_array($method_name, ["manageAdvertisements", "manageCredits"])) {
+        $db->execute("UPDATE ROLE_ACTIVITY SET ACTIVITY_ID = ACTIVITY_ID + 1 ".
+            "WHERE ACTIVITY_ID >= ?", [$activity_id]);
+        //give root account permissions on the activity.
+        $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, ?)",
+            [$activity_id]);
+    }
+    // the new translation id is derived from the number of translations
+    $sql = "SELECT COUNT(*) AS NUM FROM TRANSLATION";
+    $result = $db->execute($sql);
+    if (!$result || !($row = $db->fetchArray($result))) {
+        echo "Upgrade activity error";
+        exit();
+    }
+    //some search id start at 1000, so +1001 ensures we steer clear of them
+    $translation_id = $row['NUM'] + 1001;
+    // finally add the new activity and the identifier string it refers to
+    $db->execute("INSERT INTO ACTIVITY VALUES (?, ?, ?)",
+        [$activity_id, $translation_id, $method_name]);
+    $db->execute("INSERT INTO TRANSLATION VALUES (?, ?)",
+        [$translation_id, $string_id]);
+}
+/**
+ * Adds or replaces a translation for a database message string for a given
+ * IANA locale tag.
+ *
+ * Any existing TRANSLATION_LOCALE row for the string and locale is deleted
+ * before the new one is inserted. If either the locale tag or the string
+ * identifier cannot be found in the database, nothing is changed.
+ *
+ * @param resource& $db database handle where Yioop database stored
+ * @param string $string_id message identifier to give translation for
+ * @param string $locale_tag  the IANA language tag to update the strings of
+ * @param string $translation the translation for $string_id in the language
+ *     $locale_tag
+ */
+function updateTranslationForStringId(&$db, $string_id, $locale_tag,
+    $translation)
+{
+    $sql = "SELECT LOCALE_ID FROM LOCALE ".
+        "WHERE LOCALE_TAG = ? " . $db->limitOffset(1);
+    $result = $db->execute($sql, [$locale_tag]);
+    $row = ($result) ? $db->fetchArray($result) : false;
+    if (!isset($row['LOCALE_ID'])) {
+        // unknown locale: bail rather than insert a row with a null id
+        return;
+    }
+    $locale_id = $row['LOCALE_ID'];
+
+    $sql = "SELECT TRANSLATION_ID FROM TRANSLATION ".
+        "WHERE IDENTIFIER_STRING = ? " . $db->limitOffset(1);
+    $result = $db->execute($sql, [$string_id]);
+    $row = ($result) ? $db->fetchArray($result) : false;
+    if (!isset($row['TRANSLATION_ID'])) {
+        // unknown message identifier: nothing to attach a translation to
+        return;
+    }
+    $translate_id = $row['TRANSLATION_ID'];
+    // replace = delete any old translation, then insert the new one
+    $sql = "DELETE FROM TRANSLATION_LOCALE ".
+        "WHERE TRANSLATION_ID =? AND ".
+        "LOCALE_ID = ?";
+    $db->execute($sql, [$translate_id, $locale_id]);
+    $sql = "INSERT INTO TRANSLATION_LOCALE VALUES (?, ?, ?)";
+    $db->execute($sql, [$translate_id, $locale_id, $translation]);
+}
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@ -0,0 +1,393 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2017
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Loads crawlLog functions if needed
+ */
+require_once __DIR__."/Utility.php";
+/**
+ *
+ * Code used to manage web archive files
+ *
+ * A web archive is a sequence of serialized, compressed objects, each
+ * preceded by its compressed length. It is kept either in a file or, for
+ * string archives, in memory. A file archive ends with an info block of
+ * meta data followed by the length of that block.
+ *
+ * @author Chris Pollett
+ */
+class WebArchive
+{
+    /**
+     * Filename used to store the web archive.
+     * @var string
+     */
+    public $filename;
+    /**
+     *
+     * Current offset into the web archive the iterator for the archive is at
+     * (at most one iterator / archive -- oh well)
+     * @var int
+     */
+    public $iterator_pos;
+    /**
+     * Filter object used to compress/uncompress objects stored in archive
+     * @var object
+     */
+    public $compressor;
+    /**
+     * number of items in archive
+     * @var int
+     */
+    public $count;
+    /**
+     * version number of the current archive
+     * @var float
+     */
+    public $version;
+    /**
+     * Says whether the archive is a string archive
+     * @var bool
+     */
+    public $is_string;
+    /**
+     * If archive is stored as a string rather than persistently to disk
+     * then $storage is used to hold the string
+     * @var string
+     */
+    public $storage;
+    /**
+     * Version number to use in the WebArchive header if constructing a new
+     * archive
+     */
+    const WEB_ARCHIVE_VERSION = 1.0;
+    /**
+     * Makes or initializes a WebArchive object using the supplied parameters
+     *
+     * For a file archive, an existing file has its info block read (unless
+     * $fast_construct is set); otherwise a new file containing only an
+     * info block is created.
+     *
+     * @param string $fname filename to use to store archive to disk
+     * @param object $compressor Compressor object that should be
+     *     used to read and write objects in the archive
+     * @param bool $fast_construct if true, the info block of an existing
+     *     web archive is not read as part of the constructing process
+     * @param bool $is_string says whether the archive stores to string
+     *     rather than a file
+     */
+    public function __construct($fname, $compressor, $fast_construct = false,
+        $is_string = false)
+    {
+        $this->filename = $fname;
+        $this->compressor = $compressor;
+        $this->is_string = $is_string;
+        if ($this->is_string) {
+            $this->storage = "";
+            $this->iterator_pos = 0;
+            $this->count = 0;
+            return;
+        }
+        if (file_exists($fname)) {
+            if (!$fast_construct) {
+                $this->readInfoBlock();
+            }
+            $this->iterator_pos = 0;
+        } else {
+            $this->iterator_pos = 0;
+            $this->count = 0;
+            $fh =  fopen($this->filename, "w");
+            $this->writeInfoBlock($fh);
+            fclose($fh);
+        }
+    }
+    /**
+     * Read the info block associated with this web archive.
+     * The info block is meta data for the archive stored at the end of
+     * the WebArchive file. The particular meta is up to who is using
+     * the web archive.
+     *
+     * As a side effect $this->count and $this->version are set from the
+     * info block (to null if the block could not be decoded).
+     *
+     * @return array the contents of the info block, or null for a string
+     *     archive, if the file cannot be opened, or if the block has no
+     *     data
+     */
+    public function readInfoBlock()
+    {
+        if ($this->is_string) {
+            return null;
+        }
+        $fh =  fopen($this->filename, "r");
+        if ($fh === false) {
+            // without a handle none of the reads below can work
+            crawlLog("Web archive could not open " . $this->filename .
+                " to read its info block");
+            return null;
+        }
+        $len = $this->seekEndObjects($fh);
+        $info_string = fread($fh, $len);
+        fclose($fh);
+        /* NOTE(review): unserialize() is applied to the bytes stored in the
+           archive, so archives should only come from trusted sources
+         */
+        $info_block = unserialize($this->compressor->uncompress($info_string));
+        if (!is_array($info_block)) {
+            // corrupt or truncated block: treat as having no fields
+            $info_block = [];
+        }
+        $this->count = isset($info_block["count"]) ?
+            $info_block["count"] : null;
+        $this->version = isset($info_block["version"]) ?
+            $info_block["version"] : null;
+        if (isset($info_block["data"])) {
+            return $info_block["data"];
+        } else {
+            return null;
+        }
+    }
+    /**
+     * Serializes and applies the compressor to an info block and write it at
+     * the end of the web archive
+     * The info block is meta data for the archive stored at the end of
+     * the WebArchive file. The particular meta is up to who is using
+     * the web archive; however, count and archive version number are always
+     * stored
+     *
+     * The file is truncated at the handle's current position before the
+     * block is written, so an old info block there is replaced. Does
+     * nothing for a string archive.
+     *
+     * @param resource $fh resource for the web archive file. If null
+     *     the web archive is opened first and closed when the data is
+     *     written
+     * @param array& $data data to write into the info block of the archive
+     */
+    public function writeInfoBlock($fh = null, &$data = null)
+    {
+        if ($this->is_string) {
+            return;
+        }
+        $compressed_int_len = $this->compressor->compressedIntLen();
+        $open_flag = false;
+        if ($fh == null) {
+            $open_flag = true;
+            $fh =  fopen($this->filename, "r+");
+            $this->seekEndObjects($fh);
+        }
+        $info_block = [];
+        $info_block["version"] = self::WEB_ARCHIVE_VERSION;
+        $info_block["count"] = $this->count;
+        if ($data != null) {
+            $info_block['data'] = & $data;
+        }
+        $info_string =
+            $this->compressor->compress(serialize($info_block));
+        // the stored length covers the block and the length field itself
+        $len = strlen($info_string) + $compressed_int_len;
+
+        $offset = ftell($fh);
+        ftruncate($fh, $offset);
+
+        $out = $info_string.$this->compressor->compressInt($len);
+        fwrite($fh, $out, $len);
+
+        if ($open_flag) {
+            fclose($fh);
+        }
+    }
+    /**
+     * Seeks in the WebArchive file to the end of the last Object.
+     *
+     * The last $compressed_int_len bytes of a WebArchive say the length
+     * of an info block in bytes (this length includes those last bytes)
+     *
+     * @param resource $fh resource for the WebArchive file
+     * @return int length of the info block without its trailing length
+     *     field; for a string archive, the length of the storage string
+     */
+    public function seekEndObjects($fh)
+    {
+        if ($this->is_string) {
+            return strlen($this->storage);
+        }
+        $compressed_int_len = $this->compressor->compressedIntLen();
+        fseek($fh, - $compressed_int_len, SEEK_END);
+        $len_block = $this->compressor->uncompressInt(
+            fread($fh, $compressed_int_len));
+        fseek($fh, - ($len_block), SEEK_END);
+        return $len_block - $compressed_int_len;
+    }
+    /**
+     * Adds objects to the WebArchive
+     *
+     * Each object is stored as its compressed length followed by its
+     * compressed serialization. For a file archive the old info block is
+     * overwritten by the new objects and a fresh info block is written
+     * after them.
+     *
+     * @param string $offset_field field in objects to return the byte offset
+     *     at which they were stored
+     * @param array& $objects references to objects that will be stored
+     *     the offset field in these references will be adjusted if
+     *     $return_flag is false. The objects are accessed by the integer
+     *     indexes 0, 1, ..., count - 1
+     * @param array $data data to write in the WebArchive's info block
+     * @param string $callback name of a callback
+     *     $callback($data, $new_objects, $offset_field)
+     *     used to modify $data before it is written
+     *     to the info block. For instance, we can add offset info to data.
+     * @param bool $return_flag if true rather than adjust the offsets by
+     *     reference, create copy objects and adjust their offsets and return
+     * @return mixed adjusted objects or void
+     */
+    public function addObjects($offset_field, &$objects,
+        $data = null, $callback = null, $return_flag = true)
+    {
+        $is_string = $this->is_string;
+        if (!$is_string) {
+            $fh =  fopen($this->filename, "r+");
+            $this->seekEndObjects($fh);
+            $offset = ftell($fh);
+            // drop the old info block; a new one is written below
+            ftruncate($fh, $offset);
+        } else {
+            $offset = strlen($this->storage);
+        }
+        $out = "";
+        if ($return_flag) {
+            $new_objects = $objects;
+        } else {
+            $new_objects = & $objects;
+        }
+        $num_objects = count($new_objects);
+        $compressed_int_len = $this->compressor->compressedIntLen();
+        for ($i = 0; $i < $num_objects; $i++) {
+            // the offset is recorded in the object before it is serialized
+            $new_objects[$i][$offset_field] = $offset;
+            $file = serialize($new_objects[$i]);
+            $compressed_file = $this->compressor->compress($file);
+            $len = strlen($compressed_file);
+            $out .= $this->compressor->compressInt($len) . $compressed_file;
+            $offset += $len + $compressed_int_len;
+        }
+        $this->count += $num_objects;
+        if ($is_string) {
+            $this->storage .= $out;
+        } else {
+            fwrite($fh, $out, strlen($out));
+        }
+        if ($data != null && $callback != null) {
+            $data = $callback($data, $new_objects, $offset_field);
+        }
+        if (!$is_string) {
+            $this->writeInfoBlock($fh, $data);
+            fclose($fh);
+        }
+        if ($return_flag) {
+            return $new_objects;
+        } else {
+            return;
+        }
+    }
+    /**
+     * Open the web archive file associated with this WebArchive object.
+     *
+     * @param string $mode read/write mode to open file with
+     * @return resource a file resource for the web archive (false if the
+     *     file could not be opened); for a string archive the
+     *     placeholder string "is_string"
+     */
+    public function open($mode = "r")
+    {
+        if ($this->is_string) {
+            return "is_string";
+        }
+        $fh = fopen($this->filename, $mode);
+        return $fh;
+    }
+    /**
+     * Closes a file handle (which should be of a web archive)
+     * Does nothing for a string archive.
+     *
+     * @param resource $fh filehandle to close
+     */
+    public function close($fh)
+    {
+        if ($this->is_string) {
+            return;
+        }
+        fclose($fh);
+    }
+    /**
+     * Gets $num many objects out of the web archive starting at byte $offset
+     *
+     * If the $next_flag is true the archive iterator is advance and if $fh
+     * is not null then it is assumed to be an open resource pointing to the
+     * archive (saving the time to open it).
+     *
+     * Fewer than $num objects are returned if the end of the archive is
+     * reached first.
+     *
+     * @param int $offset a valid byte offset into a web archive
+     * @param int $num number of objects to return
+     * @param bool $next_flag whether to advance the archive iterator
+     * @param resource $fh either null or a file resource to the archive
+     * @return array pairs [offset just past the object, object] for the
+     *     objects beginning at $offset; empty if the archive cannot be
+     *     opened
+     */
+    public function getObjects($offset, $num, $next_flag = true, $fh = null)
+    {
+        $open_flag = false;
+        if ($fh == null) {
+            $fh =  $this->open();
+            $open_flag = true;
+        }
+        if ($fh === false) {
+            // the archive file could not be opened, nothing can be read
+            crawlLog("Web archive could not open " . $this->filename .
+                " to get objects");
+            return [];
+        }
+        $is_string = $this->is_string;
+        $objects = [];
+        $compressed_int_len = $this->compressor->compressedIntLen();
+        if ($is_string) {
+            $storage_len = strlen($this->storage);
+        }
+        if ((!$is_string &&fseek($fh, $offset) == 0 ) || ($is_string
+            && $offset < $storage_len)) {
+            for ($i = 0; $i < $num; $i++) {
+                if (!$is_string && feof($fh)) {break; }
+                if ($is_string && $offset >= $storage_len) {break; }
+                $object = null;
+                $compressed_len = ($is_string)
+                    ? substr($this->storage, $offset, $compressed_int_len)
+                    : fread($fh, $compressed_int_len);
+                /* feof() only turns true after a read hits the end, so a
+                   short (or failed) read of the length field is what
+                   actually signals that no further object is stored
+                 */
+                if ($compressed_len === false ||
+                    strlen($compressed_len) < $compressed_int_len) {
+                    break;
+                }
+                $len = $this->compressor->uncompressInt($compressed_len);
+                if ($len > 0 && $len < C\MAX_ARCHIVE_OBJECT_SIZE) {
+                    $compressed_file = ($is_string)
+                        ? substr($this->storage, $offset + $compressed_int_len,
+                            $len)
+                        : fread($fh, $len);
+                    /* the current error handler is swapped out while the
+                       object is decoded and Yioop's is installed after
+                     */
+                    restore_error_handler();
+                    $file = $this->compressor->uncompress($compressed_file);
+                    $object = @unserialize($file);
+                    set_error_handler(C\NS_LIB . "yioop_error_handler");
+                    $offset += $compressed_int_len + $len;
+                    $objects[] = [$offset, $object];
+                } else {
+                    crawlLog("Web archive saw blank line ".
+                        "when looked for offset $offset");
+                }
+            }
+            if ($next_flag) {
+                $this->iterator_pos = $offset;
+            }
+        }
+        if ($open_flag) {
+            $this->close($fh);
+        }
+        return $objects;
+    }
+    /**
+     * Returns $num many objects from the web archive starting at the current
+     * iterator position, leaving the iterator position unchanged
+     *
+     * @param int $num number of objects to return
+     * @return array an array of objects from the web archive
+     */
+    public function currentObjects($num)
+    {
+        return $this->getObjects($this->iterator_pos, $num, false);
+    }
+    /**
+     * Returns $num many objects from the web archive starting at the
+     * current iterator position. The iterator is advance to the object
+     * after the last one returned
+     *
+     * @param int $num number of objects to return
+     * @return array an array of objects from the web archive
+     */
+    public function nextObjects($num)
+    {
+        return $this->getObjects($this->iterator_pos, $num);
+    }
+    /**
+     * Resets the iterator for this web archive to the first object
+     * in the archive
+     */
+    public function reset()
+    {
+        $this->iterator_pos = 0;
+    }
+}
--- a/Show more
+++ b/Show more