Optimise imports
This commit is contained in:
parent
312f32bf61
commit
14817d7657
3 changed files with 52 additions and 52 deletions
17
extract.py
17
extract.py
|
@ -2,13 +2,10 @@
|
|||
Extract content from HTML files and store it as compressed JSON
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import spacy as spacy
|
||||
import boto3
|
||||
from justext import get_stoplist
|
||||
from justext.core import LENGTH_LOW_DEFAULT, LENGTH_HIGH_DEFAULT, STOPWORDS_LOW_DEFAULT, \
|
||||
STOPWORDS_HIGH_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, \
|
||||
|
@ -16,18 +13,11 @@ from justext.core import LENGTH_LOW_DEFAULT, LENGTH_HIGH_DEFAULT, STOPWORDS_LOW_
|
|||
ParagraphMaker, classify_paragraphs, revise_paragraph_classification
|
||||
from langdetect import detect
|
||||
from lxml.etree import ParserError
|
||||
from pyspark.sql import DataFrame
|
||||
from pyspark.sql import SparkSession, SQLContext
|
||||
from pyspark.sql.functions import col
|
||||
from pyspark.sql.types import StructType, StructField, StringType, LongType
|
||||
from pyspark.sql.functions import expr, col
|
||||
from pyspark import SparkFiles
|
||||
|
||||
|
||||
|
||||
import boto3
|
||||
from warcio import ArchiveIterator
|
||||
|
||||
|
||||
OUTPUT_PATH = 's3://tinysearch/outputs/index'
|
||||
|
||||
MAX_URI_LENGTH = 150
|
||||
|
@ -39,9 +29,6 @@ MAX_RESULTS_PER_HASH = 200
|
|||
PAGE_SIZE = 4096
|
||||
|
||||
|
||||
nlp = spacy.load("en_core_web_sm", disable=['lemmatizer', 'ner'])
|
||||
|
||||
|
||||
index_schema = StructType([
|
||||
StructField("term_hash", LongType(), False),
|
||||
StructField("data", StringType(), False),
|
||||
|
|
85
poetry.lock
generated
85
poetry.lock
generated
|
@ -72,6 +72,17 @@ python-versions = "*"
|
|||
[package.dependencies]
|
||||
lxml = ">=4.4.2"
|
||||
|
||||
[[package]]
|
||||
name = "langdetect"
|
||||
version = "1.0.9"
|
||||
description = "Language detection library ported from Google's language-detection."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "4.6.4"
|
||||
|
@ -86,14 +97,6 @@ html5 = ["html5lib"]
|
|||
htmlsoup = ["beautifulsoup4"]
|
||||
source = ["Cython (>=0.29.7)"]
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "1.21.1"
|
||||
description = "NumPy is the fundamental package for array computing with Python."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "1.21.4"
|
||||
|
@ -123,6 +126,31 @@ pytz = ">=2017.3"
|
|||
[package.extras]
|
||||
test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"]
|
||||
|
||||
[[package]]
|
||||
name = "py4j"
|
||||
version = "0.10.9.2"
|
||||
description = "Enables Python programs to dynamically access arbitrary Java objects"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pyspark"
|
||||
version = "3.2.0"
|
||||
description = "Apache Spark Python API"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
|
||||
[package.dependencies]
|
||||
py4j = "0.10.9.2"
|
||||
|
||||
[package.extras]
|
||||
ml = ["numpy (>=1.7)"]
|
||||
mllib = ["numpy (>=1.7)"]
|
||||
pandas_on_spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
|
||||
sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.8.2"
|
||||
|
@ -207,7 +235,7 @@ six = "*"
|
|||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "b8a9967839a08627a26c7d1517ef8dc0c08b9c0f15cdeee6efc9381e07d522ab"
|
||||
content-hash = "289e6a040c51398a25649cea5969bdb398893f8dbb62d7058a14b017d8fa82a5"
|
||||
|
||||
[metadata.files]
|
||||
beautifulsoup4 = [
|
||||
|
@ -234,6 +262,10 @@ justext = [
|
|||
{file = "jusText-3.0.0-py2.py3-none-any.whl", hash = "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"},
|
||||
{file = "jusText-3.0.0.tar.gz", hash = "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf"},
|
||||
]
|
||||
langdetect = [
|
||||
{file = "langdetect-1.0.9-py2-none-any.whl", hash = "sha256:7cbc0746252f19e76f77c0b1690aadf01963be835ef0cd4b56dddf2a8f1dfc2a"},
|
||||
{file = "langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0"},
|
||||
]
|
||||
lxml = [
|
||||
{file = "lxml-4.6.4-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:bbf2dc330bd44bfc0254ab37677ec60f7c7ecea55ad8ba1b8b2ea7bf20c265f5"},
|
||||
{file = "lxml-4.6.4-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b667c51682fe9b9788c69465956baa8b6999531876ccedcafc895c74ad716cd8"},
|
||||
|
@ -297,34 +329,6 @@ lxml = [
|
|||
{file = "lxml-4.6.4.tar.gz", hash = "sha256:daf9bd1fee31f1c7a5928b3e1059e09a8d683ea58fb3ffc773b6c88cb8d1399c"},
|
||||
]
|
||||
numpy = [
|
||||
{file = "numpy-1.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-win32.whl", hash = "sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172"},
|
||||
{file = "numpy-1.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-win32.whl", hash = "sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd"},
|
||||
{file = "numpy-1.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-win32.whl", hash = "sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2"},
|
||||
{file = "numpy-1.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33"},
|
||||
{file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"},
|
||||
{file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"},
|
||||
{file = "numpy-1.21.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8890b3360f345e8360133bc078d2dacc2843b6ee6059b568781b15b97acbe39f"},
|
||||
{file = "numpy-1.21.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:69077388c5a4b997442b843dbdc3a85b420fb693ec8e33020bb24d647c164fa5"},
|
||||
{file = "numpy-1.21.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e89717274b41ebd568cd7943fc9418eeb49b1785b66031bc8a7f6300463c5898"},
|
||||
|
@ -382,6 +386,13 @@ pandas = [
|
|||
{file = "pandas-1.3.4-cp39-cp39-win_amd64.whl", hash = "sha256:a51528192755f7429c5bcc9e80832c517340317c861318fea9cea081b57c9afd"},
|
||||
{file = "pandas-1.3.4.tar.gz", hash = "sha256:a2aa18d3f0b7d538e21932f637fbfe8518d085238b429e4790a35e1e44a96ffc"},
|
||||
]
|
||||
py4j = [
|
||||
{file = "py4j-0.10.9.2-py2.py3-none-any.whl", hash = "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521"},
|
||||
{file = "py4j-0.10.9.2.tar.gz", hash = "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615"},
|
||||
]
|
||||
pyspark = [
|
||||
{file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"},
|
||||
]
|
||||
python-dateutil = [
|
||||
{file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
|
||||
{file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
|
||||
|
|
|
@ -15,6 +15,8 @@ beautifulsoup4 = "^4.10.0"
|
|||
lxml = "^4.6.4"
|
||||
jusText = "^3.0.0"
|
||||
pandas = "^1.3.4"
|
||||
pyspark = "^3.2.0"
|
||||
langdetect = "^1.0.9"
|
||||
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
|
|
Loading…
Reference in a new issue