diff --git a/tools/scraper/.gitignore b/tools/scraper/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..952980996a469f37f22b11f630b19c0e9b3a996e
--- /dev/null
+++ b/tools/scraper/.gitignore
@@ -0,0 +1,154 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+articles
diff --git a/tools/scraper/requirements.txt b/tools/scraper/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3e4836142e360e5cf4c554972b138c754c2683a9
--- /dev/null
+++ b/tools/scraper/requirements.txt
@@ -0,0 +1,14 @@
+autopep8==1.6.0
+beautifulsoup4==4.10.0
+bs4==0.0.1
+certifi==2021.10.8
+charset-normalizer==2.0.9
+idna==3.3
+lxml==4.7.1
+markdownify==0.10.1
+pycodestyle==2.8.0
+requests==2.26.0
+six==1.16.0
+soupsieve==2.3.1
+toml==0.10.2
+urllib3==1.26.7
diff --git a/tools/scraper/scrape-wordpress-blogs.py b/tools/scraper/scrape-wordpress-blogs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c809ee85b3227175925efd767263bce73db7ed
--- /dev/null
+++ b/tools/scraper/scrape-wordpress-blogs.py
@@ -0,0 +1,74 @@
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+import os
+from pathlib import Path
+import markdownify
+
+article_base_dir = "articles"
+dir_path = os.path.dirname(os.path.realpath(__file__))
+base_path = Path(os.path.join(dir_path, article_base_dir))
+
+base_path.mkdir(exist_ok=True)
+
+base_url = "https://kszk.bme.hu"
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
+
+for page in range(1, 29):  # pages 1..28 are hard-coded; raise the upper bound if the blog grows
+    # page 1 is served at the site root, later pages under /page/<n>/
+    if page == 1:
+        base_request = requests.get(
+            base_url, headers=headers)
+    else:
+        base_request = requests.get(
+            base_url + "/page/" + str(page) + "/", headers=headers)
+    base_soup = BeautifulSoup(base_request.content, features='lxml')
+
+    articles = base_soup.find_all("article", class_="post")
+
+    for article in articles:
+        article_link = article.find("a", href=True)['href']
+        article_title = article.find("a", href=True).getText()
+        print(article_title)
+        article_tags = []
+        try:
+            article_tags = map(lambda tag_link: tag_link.getText(), article.find(
+                "span", class_="tags-links").find_all("a", rel="tag"))
+        except AttributeError:  # posts without tags make find() return None
+            pass
+
+        article_slug = article_link.rsplit('/', 2)[1]
+        article_date = datetime.strptime(article.find(
+            "time")['datetime'], '%Y-%m-%dT%H:%M:%S%z')
+        article_author_link = article.find(
+            "a", rel="author", href=True)['href']
+
+        article_folder_path = Path.joinpath(
+            base_path, str(article_date.year), str(
+                article_date.month).zfill(2), str(article_date.day).zfill(2))
+        article_path = Path.joinpath(article_folder_path, article_slug + ".md")
+
+        article_folder_path.mkdir(parents=True, exist_ok=True)
+
+        article_request = requests.get(article_link, headers=headers)
+        article_soup = BeautifulSoup(article_request.content, features='lxml')
+        article_content_as_html = article_soup.find(
+            "div", class_="entry-content")
+        article_content_as_md = markdownify.markdownify(
+            str(article_content_as_html), heading_style="ATX")
+
+        content = """\
+---
+slug: {slug}
+title: {title}
+authors: {author_link}
+tags: [{tags}]
+---
+
+{content}
+""".format(slug=article_slug, title=article_title, author_link=article_author_link, tags=", ".join(list(article_tags)), content=article_content_as_md)
+
+        # write the Markdown file as UTF-8 so accented characters survive on every platform
+        with open(article_path, "w", encoding="utf-8") as article_file:
+            article_file.write(content)