All of Habr in one database

Good afternoon. Two years have passed since the last article about parsing Habr was written, and some things have changed since then.


When I wanted to have my own copy of Habr, I decided to write a parser that would save all of the authors' content into a database. How it went and what mistakes I ran into, you can read under the cut.


Part 2 | mega.nz | Github


The first version of the parser. One thread, many problems


To begin with, I decided to make a prototype script that would parse an article and immediately write it to the database. Without thinking twice, I used sqlite3, because it was the least labor-intensive option: no need to run a local server; create the file, look at it, delete it, and so on.


one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")
    for i in range(min, max):
        url = "https://m.habr.com/post/{}".format(i)
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException:
            # Log the id of the failed request so the range can be retried later
            with open("req_errors.txt", "a") as file:
                file.write(str(i) + "\n")
            continue
        if(r.status_code != 200):
            print("{} - {}".format(i, r.status_code))
            continue

        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')

        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = soup.find(id="post-content-body")
            content = str(content)
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]  # strip the leading label (e.g. 'Метки') from the tags text
        except AttributeError:
            author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
            content = "Error parsing content."

        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)
    c.execute("commit")
    print(datetime.now() - start_time)

main(1, 490406)

Everything is classic: we use Beautiful Soup and requests, and a quick prototype is ready. Except that...


  • Pages are downloaded in a single thread


  • If you interrupt the script, the whole database goes nowhere, because the commit is executed only after all the parsing is done.
    Of course, you can commit after every insertion, but then the execution time of the script grows significantly (a batched compromise is sketched after this list).


  • Parsing the first 100,000 articles took me 8 hours.
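
A middle ground between one final commit and a commit per row is to commit in batches: an interruption then loses at most one batch. A minimal sketch (the batch size of 1000 is my assumption, not a value from the original script):

import sqlite3

def insert_batched(rows, batch_size=1000):
    # rows: an iterable of (id, author, title, content, tags) tuples
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    for n, row in enumerate(rows, start=1):
        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
        # Commit every batch_size rows: the commit overhead stays small,
        # and an interrupted run keeps everything up to the last full batch
        if n % batch_size == 0:
            conn.commit()
    conn.commit()  # flush the final partial batch
    conn.close()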



Then I came across an article by cointegrated, and after reading it I found a couple of life hacks to speed this process up:


  • Multithreading speeds up parsing considerably.
  • You can fetch not the full version of Habr, but its mobile version.
    For example, an article by cointegrated that weighs 378 KB in the desktop version is only 126 KB in the mobile one (a quick size check is sketched below).
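
The difference is easy to verify; a quick sketch (the article id here is a placeholder):

import requests

post_id = 1  # placeholder: substitute any article number

# Compare the payload size of the desktop and mobile versions of the same article
desktop = requests.get("https://habr.com/post/{}".format(post_id))
mobile = requests.get("https://m.habr.com/post/{}".format(post_id))
print("desktop: {} KB".format(len(desktop.content) // 1024))
print("mobile:  {} KB".format(len(mobile.content) // 1024))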

The second version of the parser. Threads, a temporary solution


When I decided to parallelize the parsing in Python, I chose multiprocessing.dummy, and together with multithreading the problems arrived.


SQLite3 does not want to work with more than one thread.
It can be fixed with check_same_thread=False, but that error is not the only one: when trying to insert into the database, errors sometimes occur that I could not resolve.
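
For reference, a minimal sketch of what sharing one connection between threads looks like; the explicit Lock is my addition, since check_same_thread=False only lifts the thread check and does not make concurrent writes on one connection safe:

import sqlite3
import threading

# check_same_thread=False allows using the connection outside the creating thread
conn = sqlite3.connect('habr.db', check_same_thread=False)
lock = threading.Lock()

def insert_article(row):
    # Serialize the writes ourselves: simultaneous writes
    # on a shared connection still race without a lock
    with lock:
        conn.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
        conn.commit()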


Therefore, I decided to abandon inserting articles straight into the database and, remembering the cointegrated solution, decided to use files, since there are no problems with multi-threaded writing to separate files.


Habr starts banning you for using more than three threads.
Particularly zealous attempts to get through to Habr can end with an ip ban for a couple of hours. So you have to use just 3 threads, but even that is already good: the time to fetch 100 articles drops from 26 to 12 seconds.


It is worth noting that this version is rather unstable, and on a large number of articles the downloads periodically fall off.


three_threads_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    # A 503 means Habr has banned us for a while; remember the id so it can be re-fetched later
    if (r.status_code == 503):
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
            logging.warning('{} / 503 Error'.format(i))

    # Any other non-200 code: log it and skip the article
    if (r.status_code != 200):
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')
        timestamp = timestamp['title']

        content = soup.find(id="post-content-body")
        content = str(content)
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]  # strip the leading label (e.g. 'Метки') from the tags text

        # A second tag block from the mobile layout; saved just in case
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except AttributeError:
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "Error parsing content."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Save the article to a json file
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify the min and max values. Example: async_v1.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # You can set more than 3 threads here,
    # but habr bans your ip for a while if you do
    pool = ThreadPool(3)

    # Start the timer and launch the parsing
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # Close the pool and wait for all workers to finish
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
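
The script takes the range of article ids on the command line, for example: python three_threads_v1.py 1 100.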

The third version. Final


One day, after yet another run of the parser, I accidentally discovered that Habr has an API that the mobile version of the site talks to. It loads faster than the mobile version, since it is plain json that does not even need to be parsed. In the end, I decided to rewrite my script once again.


So, having found the API, I decided to use it.


three_threads_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except requests.exceptions.RequestException:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        data = (id, is_tutorial, time_published, title, content, comments_count, lang, tags_string, reading_count, author, score)
        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify the min and max values. Example: async.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # You can set more than 3 threads here,
    # but habr bans your ip for a while if you do
    pool = ThreadPool(3)

    # Start the timer and launch the parsing
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # Close the pool and wait for all workers to finish
    pool.close()
    pool.join()
    print(datetime.now() - start_time)

The response contains a lot of data at once: about the article itself, its rating, and its author.


[API.png: a screenshot of the API response]


There are many fields in the json that the API returns; I took only the ones I needed (a rough sketch of the response shape follows the list):


  • id
  • is_tutorial
  • time_published
  • title
  • content
  • comments_count
  • lang - the language the article is written in; so far it only takes the values en and ru.
  • tags_string - all of the article's tags in one string
  • reading_count
  • author
  • score - the article's rating.
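
Judging by the keys the script reads, the response is shaped roughly like this (an illustrative sketch with placeholder values, not a verbatim API response):

# Rough shape of the API response, inferred from the fields used above;
# all values here are placeholders
response = {
    "success": True,
    "data": {
        "article": {
            "id": 1,
            "is_tutorial": False,
            "time_published": "2019-01-01T00:00Z",
            "comments_count": 0,
            "lang": "ru",
            "tags_string": "example, tags",
            "title": "Example title",
            "text_html": "<div>...</div>",
            "reading_count": 0,
            "author": {"login": "example_user"},
            "voting": {"score": 0},
        }
    }
}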

Thus, using the API, I brought the parsing time down to 8 seconds per 100 urls.


After I had downloaded all the data, it remained to process it and load it into the database. There were no problems with that:


three_threads_parser.py
import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0')  # Disable synchronous disk writes: much faster, but the database can be corrupted if the machine crashes mid-write
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
    lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")
    try:
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                with open(filename) as f:
                    data = json.load(f)

                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data

                # Note that the column order in the INSERT differs from the
                # order in the json. If the downloader changes the layout of
                # data, this unpacking breaks, so be careful.

                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (id, time_published, author,
                                                                                        title, content, lang,
                                                                                        comments_count, reading_count,
                                                                                        score, is_tutorial,
                                                                                        tags_string))

            except IOError:
                logging.info('FileNotExists')
                continue

    finally:
        conn.commit()
        conn.close()

start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)


And finally, some statistics (the queries behind these numbers are sketched after the list):


  • Out of the expected 490,406 articles, only 228,512 were downloaded. It turns out that more than half (261,894) of the articles on Habr were hidden or deleted.
  • The whole database of almost half a million articles weighs 2.95 GB. Compressed: 495 MB.
  • In total, 37,804 people are authors on Habr. Keep in mind that these statistics come only from live articles.
  • The most productive author on Habr is alizar: 8,774 articles.
  • The highest-rated article has 1,448 upvotes
  • The most-read article has 1,660,841 views
  • The most-discussed article has 2,444 comments
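
A hedged sketch of the kind of queries behind these numbers, run against the articles table created above:

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()

# Total number of downloaded articles and distinct authors
c.execute("SELECT COUNT(*), COUNT(DISTINCT author) FROM articles")
print("articles, authors:", c.fetchone())

# Top 15 most productive authors
c.execute("SELECT author, COUNT(*) AS n FROM articles "
          "GROUP BY author ORDER BY n DESC LIMIT 15")
print(c.fetchall())

# Record holders: highest score, most views, most comments
c.execute("SELECT MAX(score), MAX(reading_count), MAX(comments_count) FROM articles")
print(c.fetchone())

conn.close()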

[Top 15 authors]

[Top 15 by rating]

[Top 15 by followers]

[Top 15 by comments]

[All articles]