All of Habr in one database

Good afternoon. Two years have passed since the last article about parsing Habr, and some things have changed since then.


When I wanted to have my own copy of Habr, I decided to write a parser that would save all of the authors' content in a database. How it went and what errors I ran into — read about it below the cut.


Part 2 | mega.nz | Github


The first version of the parser. One thread, many problems


To start with, I decided to make a prototype script in which each article would be parsed and put into the database as soon as it was downloaded. Without thinking twice, I used sqlite3, because it required the least effort: no local server needed — create it, look at it, delete it, and so on.


one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")
    for i in range(min, max):
        url = "https://m.habr.com/post/{}".format(i)
        try:
            r = requests.get(url)
        except:
            with open("req_errors.txt") as file:
                file.write(i)
            continue
        if(r.status_code != 200):
            print("{} - {}".format(i, r.status_code))
            continue

        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')

        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = soup.find(id="post-content-body")
            content = str(content)
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]
        except:
            author,title,tags = "Error", "Error {}".format(r.status_code), "Error"
            content = "     ."

        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)
    c.execute("commit")
    print(datetime.now() - start_time)

main(1, 490406)

Everything is classic: Beautiful Soup, requests, and a quick prototype is ready. Except that ...


  • Pages are downloaded in a single thread


  • If you interrupt the script, the whole database goes nowhere: the commit is only executed once all the parsing is finished.
    Of course, you can commit to the database after every insert, but then the script's run time grows significantly (see the sketch after this list).


  • Parsing the first 100,000 articles took me 8 hours.
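The trade-off in the second point is easy to measure. A minimal sketch (not from the original script, same table layout as above) that inserts the same dummy rows once with a commit per row and once inside a single transaction:

import sqlite3
from datetime import datetime

def insert_rows(path, rows, commit_every_row):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")
    start = datetime.now()
    for row in rows:
        c.execute("INSERT INTO habr VALUES (?, ?, ?, ?, ?)", row)
        if commit_every_row:
            conn.commit()   # survives an interrupt, but forces a disk sync per article
    if not commit_every_row:
        conn.commit()       # one big transaction: fast, but lost if the script is killed
    conn.close()
    return datetime.now() - start

rows = [(i, "author", "title", "text", "tags") for i in range(1000)]
print(insert_rows("per_row.db", rows, True))
print(insert_rows("one_commit.db", rows, False))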



Then I came across a post by cointegrated and took a couple of tips from it that speed the process up considerably:


  • Using multithreading speeds up downloading many times over.
  • You can fetch the mobile version of Habr instead of the full one.
    For example, an article by cointegrated that weighs 378 KB in the desktop version is only 126 KB in the mobile one (a quick check is sketched after this list).
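A small sketch (not from the article; the post id is arbitrary, and it assumes the m.habr.com mirror used in the article still answers) comparing the size of the desktop and mobile versions of the same post:

import requests

post_id = 1000  # arbitrary article id, just for the comparison
desktop = requests.get("https://habr.com/post/{}".format(post_id))
mobile = requests.get("https://m.habr.com/post/{}".format(post_id))

print("desktop: {} KB".format(len(desktop.content) // 1024))
print("mobile:  {} KB".format(len(mobile.content) // 1024))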

The second version. Many threads, a temporary ban from Habr


After digging around the internet on the topic of multithreading in python, I picked the simplest option, multiprocessing.dummy, and noticed that new problems appeared along with the multithreading.


SQLite3 does not want to work with more than one thread.
This can be fixed with check_same_thread=False, but that is not the only problem: when inserting into the database, errors sometimes appear that I never managed to solve.
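For reference, the usual workaround looks roughly like this (a sketch, not what ended up in the final script): share one connection between threads and serialize writes with a lock.

import sqlite3
import threading

# check_same_thread=False only disables the thread-ownership check;
# the shared connection still has to be protected by a lock.
conn = sqlite3.connect("habr.db", check_same_thread=False)
conn.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")
db_lock = threading.Lock()

def insert_article(row):
    with db_lock:
        conn.execute("INSERT INTO habr VALUES (?, ?, ?, ?, ?)", row)
        conn.commit()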


So I decided to drop inserting articles straight into the database and, remembering cointegrated's approach, to use files instead, since there are no problems with multi-threaded writes to files.


Habr starts banning you for using more than three threads.
Particularly zealous attempts to get through to Habr can end with an ip ban for a couple of hours. So only 3 threads can be used, but even that is already an improvement: the time to go through 100 articles drops from 26 to 12 seconds.


It is worth noting that this version is rather unstable, and on a large number of articles the downloads periodically fail.


three_threads_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try: r = requests.get(url)
    except:
        with open("req_errors.txt") as file:
            file.write(i)
        return 2

    # Note the posts for which the server returned 503
    if (r.status_code == 503):
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
            logging.warning('{} / 503 Error'.format(i))

    # Log everything that is not 200 and skip the post
    if (r.status_code != 200):
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')
        timestamp = timestamp['title']

        content = soup.find(id="post-content-body")
        content = str(content)
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]

        # A separate tag block of the post (class tm-tags tm-tags_post)
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except:
        author = title = tags = timestamp = tm_tag = rating = "Error" 
        content = "     ."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Dump everything we collected into a json file
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("  min  max. : async_v1.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # Don't start more than 3 threads,
    # otherwise Habr bans the ip for a while
    pool = ThreadPool(3)

    # Start the timer and launch the threads
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # Wait for all threads to finish and print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
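The script takes the range of post ids from the command line, e.g. python three_threads_v1.py 1 100 downloads posts 1 through 99, one json file per post; note that the files\ directory must exist beforehand.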

The third version. Final


While debugging the second version, I discovered that Habr, as it turns out, has an API that the mobile version of the site queries. It loads faster than the mobile page, since it is plain json that hardly even needs parsing. In the end I decided to rewrite my script once more.


So, having found this API endpoint, we can start parsing it.


three_threads_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except:
        with open("req_errors.txt") as file:
            file.write(i)
        return 2

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        data = (id, is_tutorial, time_published, title, content, comments_count, lang, tags_string, reading_count, author, score)
        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("  min  max. : asyc.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # Don't start more than 3 threads,
    # otherwise Habr bans the ip for a while
    pool = ThreadPool(3)

    # Start the timer and launch the threads
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # Wait for all threads to finish and print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)

The response contains fields related both to the article itself and to the author who wrote it.


API.png
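The screenshot is not reproduced here, so here is a quick way to look at the response yourself (a sketch; the article id is arbitrary and the endpoint is the one used by the script above, assuming it still responds):

import json
import requests

r = requests.get("https://m.habr.com/kek/v1/articles/1000/?fl=ru%2Cen&hl=ru")
data = json.loads(r.text)

if data.get("success"):
    article = data["data"]["article"]
    print(sorted(article.keys()))            # fields of the article itself
    print(sorted(article["author"].keys()))  # fields of the author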


I did not dump the full json of each article, but kept only the fields I actually needed (a sketch of reading such a dump back follows the list):


  • id
  • is_tutorial
  • time_published
  • title
  • content
  • comments_count
  • lang — the language the article is written in. So far it only contains en and ru.
  • tags_string — all the tags of the post
  • reading_count
  • author
  • score — the article's rating.
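Each dumped file is just this tuple serialized as a json array, so reading one back is trivial (a sketch; the file name is an arbitrary example):

import json

fields = ("id", "is_tutorial", "time_published", "title", "content",
          "comments_count", "lang", "tags_string", "reading_count",
          "author", "score")

with open("files\\12345.json") as f:   # any previously downloaded post id
    record = dict(zip(fields, json.load(f)))

print(record["author"], record["score"])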

Thus, by using the API, I brought the script's run time down to 8 seconds per 100 urls.


Once the data we need has been downloaded, it has to be processed and put into the database. That did not cause any problems either:


three_threads_parser.py
import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0') # Trade durability for speed: writes are not fsynced, data may be lost on a crash.
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
    lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")
    try:
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                f = open(filename)
                data = json.load(f)

                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data

                # The order of the fields in the json dump does not match the column order in the table,
                # so the values from data are rearranged in the INSERT below. Be careful not to mix them up.

                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (id, time_published, author,
                                                                                        title, content, lang,
                                                                                        comments_count, reading_count,
                                                                                        score, is_tutorial,
                                                                                        tags_string))
                f.close()

            except IOError:
                logging.info('FileNotExists')
                continue

    finally:
        conn.commit()

start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)


And traditionally, to wrap up, a bit of statistics gathered from all the data:


  • Out of the expected 490 406 articles, only 228 512 were downloaded. It turns out that more than half of them (261894) were hidden or deleted.
  • The whole database weighs 2.95 GB. Compressed — 495 MB.
  • In total there are 37804 Habr authors. Keep in mind that these statistics only cover live posts.
  • The most productive author on Habr is alizar — 8774 articles.
  • The highest-rated article — 1448 pluses
  • The most read article — 1660841 views
  • The most discussed article — 2444 comments

Top 15 authors

Top 15 by rating

Top 15 by reading count

Top 15 by comments
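The tops above come straight from the database; a sketch of the kind of queries involved (column names as in the articles table created earlier):

import sqlite3

conn = sqlite3.connect('habr.db')

# Top 15 most productive authors
for author, count in conn.execute(
        "SELECT author, COUNT(*) AS cnt FROM articles "
        "GROUP BY author ORDER BY cnt DESC LIMIT 15"):
    print(author, count)

# Top 15 by rating, by reads and by comments
for column in ("score", "reading_count", "comments_count"):
    rows = conn.execute(
        "SELECT title, {} FROM articles ORDER BY {} DESC LIMIT 15".format(column, column))
    print(rows.fetchall())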
