All of Habr in one database

Good afternoon. It has been two years since the last article about parsing Habr was written, and a few things have changed since then.


When I wanted my own copy of Habr, I decided to write a parser that would save all of the authors' content to a database. How it went and what mistakes I made — you can read about it under the cut.


Part 2 | mega.nz | Github


The first version of the parser. One thread, many problems


To start with, I decided to write a prototype script that parses an article and puts it into the database right away. Without much thought I went with sqlite3, because it was the least effort: no local server that has to be set up, created, dropped and so on.


one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")
    for i in range(min, max):
        url = "https://m.habr.com/post/{}".format(i)
        try:
            r = requests.get(url)
        except:
            # The request itself can fail (connection reset, timeout and so on);
            # log the id and move on. The file must be opened for appending to be writable.
            with open("req_errors.txt", "a") as file:
                file.write(str(i) + "\n")
            continue
        if(r.status_code != 200):
            print("{} - {}".format(i, r.status_code))
            continue

        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')

        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = soup.find(id="post-content-body")
            content = str(content)
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]
        except:
            author,title,tags = "Error", "Error {}".format(r.status_code), "Error"
            content = "     ."

        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)
    c.execute("commit")
    print(datetime.now() - start_time)

main(1, 490406)

Everything is classic: we take Beautiful Soup and requests, and a quick prototype is ready. Except that...


  • Pages are downloaded in a single thread


  • If you interrupt the script, the whole database is lost: the commit only happens once all the parsing is done.
    Of course you could commit after every insert, but then the script's runtime grows considerably (see the sketch after this list).


  • Parsing the first 100,000 articles took 8 hours.
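
To make the commit trade-off from the second point concrete, here is a minimal sketch that is not part of the original scripts (the table and the data are made up): it times per-row commits against a single transaction in sqlite3.

import sqlite3
from datetime import datetime

def insert_rows(db_path, rows, commit_every_row):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS demo(id INT, payload TEXT)")
    start = datetime.now()
    for i, payload in rows:
        c.execute("INSERT INTO demo VALUES (?, ?)", (i, payload))
        if commit_every_row:
            conn.commit()   # durable after every row, but each commit hits the disk
    if not commit_every_row:
        conn.commit()       # one commit at the end: fast, but an interrupt loses everything
    conn.close()
    return datetime.now() - start

rows = [(i, "text" * 100) for i in range(10000)]
print("per-row commits:   ", insert_rows("per_row.db", rows, True))
print("single transaction:", insert_rows("one_tx.db", rows, False))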



Along the way I came across an article by cointegrated, from which I took away a couple of things:


  • Use the mobile version of the site.
  • Mobile pages are much lighter than the full ones: in cointegrated's example, 378 against 126.

The second version. Threads and a temporary ban from Habr


For multithreading in Python I went with multiprocessing.dummy, since it is the simplest thing to drop into an existing script.


SQLite3 refuses to work with more than one thread out of the box.
This is fixed with check_same_thread=False, but it is a crutch rather than a real solution, and nothing stops the threads from stepping on each other's writes.
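
For reference, this is roughly what that crutch looks like. A minimal sketch, not the code I ended up using: one connection is shared by all threads, and the lock that serializes the writes is my addition.

import sqlite3
import threading

# check_same_thread=False only disables sqlite3's thread-ownership check,
# so the actual writes are serialized with a Lock here.
conn = sqlite3.connect("habr.db", check_same_thread=False)
conn.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")
db_lock = threading.Lock()

def save(row):
    with db_lock:
        conn.execute("INSERT INTO habr VALUES (?, ?, ?, ?, ?)", row)
        conn.commit()

threads = [threading.Thread(target=save, args=((i, "author", "title", "text", "tags"),)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
conn.close()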


This time each downloaded article is first written to its own json file, and the database is filled later in a separate pass.


The other constraint is Habr itself.
Go above three threads and it starts banning by ip, so three is the limit I stuck to.
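
One way to live with that limit, sketched here as an assumption rather than something the scripts below actually do (they only log the id of a failed request), is to pause and retry whenever a 503 comes back:

import time
import requests

def polite_get(url, retries=3, pause=60):
    # Retry a few times, sleeping between attempts, when Habr answers 503.
    for _ in range(retries):
        r = requests.get(url)
        if r.status_code != 503:
            return r
        time.sleep(pause)
    return r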


With all of that in mind, the next version of the script came together.


three_threads_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try: r = requests.get(url)
    except:
        # The request itself failed; log the id for a later retry
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    # A 503 means Habr is throttling us; note the id so it can be retried later
    if (r.status_code == 503):
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
            logging.warning('{} / 503 Error'.format(i))

    # Anything other than 200 means there is nothing to parse here
    if (r.status_code != 200):
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')
        timestamp = timestamp['title']

        content = soup.find(id="post-content-body")
        content = str(content)
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]

        # A second block of tags on the page; saved separately just in case
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except:
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "Could not parse this article."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Dump everything collected into a json file
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("  min  max. : async_v1.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    #   Do not create more than 3 threads,
    #   otherwise you will get banned by ip
    pool = ThreadPool(3)

    #  Start the timer and launch the download
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    #  Wait for all threads to finish and print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
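
The ids that ran into a 503 end up in Error503.txt, so they can be fed back into the same pool later. A sketch of such a follow-up run, assuming the script above is saved as three_threads_v1.py and Error503.txt holds one id per line:

from multiprocessing.dummy import Pool as ThreadPool

from three_threads_v1 import worker  # the worker() defined above

def retry_503(path="Error503.txt"):
    # Re-download only the articles that previously got a 503
    with open(path) as f:
        ids = [int(line) for line in f if line.strip()]
    pool = ThreadPool(3)  # keep the same three-thread limit
    pool.map(worker, ids)
    pool.close()
    pool.join()

if __name__ == "__main__":
    retry_503()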

The third version. The final one


While the articles were downloading, I discovered that the mobile version of the site gets everything from an API, and that API returns clean json with everything I needed. There is no point in parsing html at all.


So the next version of the script talks to the API directly.
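
To see what the endpoint returns, it is enough to request a single article and look at the keys. The url is the same one used in the script below; the article id here is just an example.

import json
import requests

r = requests.get("https://m.habr.com/kek/v1/articles/1/?fl=ru%2Cen&hl=ru")
data = json.loads(r.text)
print(data.get("success"))
# For an existing article the payload sits under data['data']['article']
if data.get("success"):
    print(sorted(data["data"]["article"].keys()))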


three_threads_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except:
        # The request itself failed; log the id for a later retry
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        data = (id, is_tutorial, time_published, title, content, comments_count, lang, tags_string, reading_count, author, score)
        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("  min  max. : asyc.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    #   Do not create more than 3 threads,
    #   otherwise you will get banned by ip
    pool = ThreadPool(3)

    #  Start the timer and launch the download
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    #  Wait for all threads to finish and print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)

This is both faster and cleaner: instead of tearing html apart, we get a small, ready-made json for every article.


API.png


The json for each article contains the following fields:


  • id
  • is_tutorial
  • time_published
  • title
  • content
  • comments_count
  • lang — the language the article is written in, either en or ru.
  • tags_string — all of the article's tags in one string
  • reading_count
  • author
  • score — the article's rating.

With the API, the 8 hours per 100 thousand urls of the first version are a thing of the past.


Once everything was downloaded, it was time to process the json files and put them into the database. The script for that:


three_threads_parser.py
import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0') # Turn off write confirmation; inserts get several times faster.
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
    lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")
    try:
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                f = open(filename)
                data = json.load(f)

                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data

                # The field order in the json files does not match the column order
                # in the table, so the values from data are rearranged here on insert.

                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (id, time_published, author,
                                                                                        title, content, lang,
                                                                                        comments_count, reading_count,
                                                                                        score, is_tutorial,
                                                                                        tags_string))
                f.close()

            except IOError:
                logging.info('FileNotExists')
                continue

    finally:
        conn.commit()

start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)
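
Once habr.db is filled, numbers like the ones below can be pulled out with a few queries. A small sketch against the articles table created above: the column names come from that schema, the queries themselves are just an illustration.

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()

# Top 15 authors by number of articles
c.execute("SELECT author, COUNT(*) AS cnt FROM articles GROUP BY author ORDER BY cnt DESC LIMIT 15")
print(c.fetchall())

# Highest score, biggest view count and biggest comment count
c.execute("SELECT MAX(score), MAX(reading_count), MAX(comments_count) FROM articles")
print(c.fetchone())

conn.close()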


Some statistics, now that everything is in one database:


  • Out of 490 406 ids, only 228 512 articles were actually downloaded. It turns out that more than half (261894) of the articles on Habr have been hidden or deleted.
  • The whole database weighs 2.95 GB, and 495 MB when archived.
  • The downloaded articles were written by 37804 different authors.
  • The most prolific author is alizar — 8774 articles.
  • The highest rated article scored 1448 points
  • The most read article has 1660841 views
  • The most commented one has 2444 comments

Top 15 authors

Top 15 by rating

Top 15 by followers

Top 15 by comments
