Boa tarde. Faz dois anos desde a redação do último artigo sobre a análise de Habr, e alguns pontos mudaram.
Quando quis ter uma cópia do Habr, decidi escrever um analisador que salvaria todo o conteúdo dos autores em um banco de dados. Como aconteceu e quais erros encontrei — você pode ler abaixo.
Parte 2 | mega.nz | Github
A primeira versão do analisador. Uma discussão, muitos problemas
Para começar, decidi criar um script protótipo no qual o artigo seria analisado imediatamente e gravado no banco de dados. Sem pensar duas vezes, usei o sqlite3, porque consumia menos tempo: você não precisa ter um servidor local — criou, olhou, excluiu, e coisas assim.
one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime
def main(min, max):
    """Download Habr posts with ids in [min, max) and store them in SQLite.

    Creates habr.db (table ``habr``) if needed, fetches each mobile-site
    post page, parses author/title/content/tags and inserts one row per
    post.  NOTE: the parameter names shadow the builtins ``min``/``max``;
    they are kept for backward compatibility with existing callers.
    """
    conn = sqlite3.connect('habr.db')
    try:
        c = conn.cursor()
        c.execute('PRAGMA encoding = "UTF-8"')
        c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")
        start_time = datetime.now()
        for i in range(min, max):
            url = "https://m.habr.com/post/{}".format(i)
            try:
                r = requests.get(url)
            except requests.RequestException:
                # Append mode ("a") and str() — the original opened the log
                # read-only and wrote an int, which itself raised inside
                # the handler and aborted the run.
                with open("req_errors.txt", "a") as file:
                    file.write(str(i) + "\n")
                continue
            if r.status_code != 200:
                print("{} - {}".format(i, r.status_code))
                continue
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                author = soup.find(class_="tm-user-info__username").get_text()
                content = str(soup.find(id="post-content-body"))
                title = soup.find(class_="tm-article-title__text").get_text()
                # First 5 characters are a static "Tags:"-style prefix.
                tags = soup.find(class_="tm-article__tags").get_text()[5:]
            except AttributeError:
                # A selector matched nothing (page layout differs) and
                # find() returned None; store placeholders instead.
                author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
                content = "     ."
            c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)',
                      (i, author, title, content, tags))
            print(i)
        # Single commit at the end keeps the run fast; sqlite3 starts the
        # transaction implicitly, so the manual "begin"/"commit" cursor
        # statements are unnecessary.
        conn.commit()
        print(datetime.now() - start_time)
    finally:
        conn.close()


main(1, 490406)
Tudo é clássico — usamos Beautiful Soup e requests, e um protótipo rápido está pronto. Isso é só ...
- O carregamento da página ocorre em um fluxo 
 
- Se você interromper a execução do script, o banco de dados inteiro não chegará a lugar algum. Afinal, a consolidação é executada somente após toda a análise.
 Obviamente, você pode postar alterações no banco de dados após cada inserção, mas o tempo de execução do script aumentará significativamente.
 
 
- Analisando os primeiros 100.000 artigos, levei 8 horas. 
 
cointegrated, , :
- .
- , .
 , cointegrated 378 , 126 .
. ,
python, multiprocessing.dummy, , .
SQLite3 .
check_same_thread=False, , , .
, cointegrated, , . .
.
Para não ser banido por IP, decidi usar apenas 3 threads: o tempo para baixar 100 artigos caiu de 26 para 12 segundos.
, , .
three_threads_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Fetch Habr post ``i`` and dump the parsed fields to files\\<i>.json.

    Returns 1 if the json file already exists, 2 on a request error, the
    HTTP status code on a non-200 response, and None on success.
    """
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1
    url = "https://m.habr.com/post/{}".format(i)
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Append mode ("a") and str() — the original opened the log
        # read-only and wrote an int, both of which raise inside the
        # handler.
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    if r.status_code == 503:
        # Throttled by the server: remember the id so it can be retried.
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
            logging.warning('{} / 503 Error'.format(i))

    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    soup = BeautifulSoup(r.text, 'html5lib')
    try:
        author = soup.find(class_="tm-user-info__username").get_text()
        timestamp = soup.find(class_='tm-user-meta__date')['title']
        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        # First 5 characters are a static "Tags:"-style prefix.
        tags = soup.find(class_="tm-article__tags").get_text()[5:]
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()
        rating = soup.find(class_="tm-votes-score").get_text()
    except (AttributeError, TypeError, KeyError):
        # Some selector matched nothing (find() returned None) — record
        # the id for a later retry and store placeholder values.
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "     ."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except Exception:
        # Surface which id failed before re-raising.
        print(i)
        raise
if __name__ == '__main__':
    # Usage: two positional arguments give the [min, max) range of post
    # ids to fetch.  (The original usage string was garbled by a broken
    # text encoding; restored to a readable message.)
    if len(sys.argv) < 3:
        print("Specify min and max. Example: async_v1.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # Only 3 threads: more concurrent requests got the IP throttled.
    pool = ThreadPool(3)

    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    pool.close()
    pool.join()
    print(datetime.now() - start_time)
 .
, , , , API, . , , json, . .
, API, .
three_threads_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Fetch article ``i`` from the Habr mobile API and dump the selected
    fields to files\\<i>.json.

    Returns 1 if the json file already exists, 503 when throttled, 2 on a
    request error, and None otherwise (including when the API reports the
    article does not exist — then no file is written).
    """
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1
    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)
    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except requests.RequestException:
        # Append mode ("a") and str() — the original opened the log
        # read-only and wrote an int, both of which raise inside the
        # handler.
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2
    data = json.loads(r.text)
    if data['success']:
        article = data['data']['article']
        # Renamed from `id` to avoid shadowing the builtin.
        article_id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']
        # Field order must match what three_threads_parser.py unpacks.
        data = (article_id, is_tutorial, time_published, title, content,
                comments_count, lang, tags_string, reading_count, author, score)
        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)
if __name__ == '__main__':
    # Usage: two positional arguments give the [min, max) range of post
    # ids to fetch.  (The original usage string was garbled by a broken
    # text encoding; restored to a readable message.)
    if len(sys.argv) < 3:
        print("Specify min and max. Example: async_v2.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # Only 3 threads: more concurrent requests got the IP throttled.
    pool = ThreadPool(3)

    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    pool.close()
    pool.join()
    print(datetime.now() - start_time)
 , , , .
json , :
- id
- is_tutorial
- time_published
- title
- content
- comments_count
- lang — , . en ru.
- tags_string —
- reading_count
- author
- score — .
Com a API, o tempo de download caiu para cerca de 8 segundos a cada 100 urls.
, , . :
three_threads_parser.py
import json
import sqlite3
import logging
from datetime import datetime
def parser(min, max):
    """Load files\\<i>.json for i in [min, max) into SQLite table ``articles``.

    Missing json files are skipped and logged.  NOTE: the parameter names
    shadow the builtins ``min``/``max``; kept for backward compatibility
    with existing callers.
    """
    conn = sqlite3.connect('habr.db')
    try:
        c = conn.cursor()
        c.execute('PRAGMA encoding = "UTF-8"')
        # Trade durability for speed — acceptable here because the source
        # json files survive and the import can simply be re-run.
        c.execute('PRAGMA synchronous = 0')
        c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
        lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                # Context manager closes the handle even when json.load or
                # the INSERT raises; the original leaked the file in that
                # case because f.close() was not in a finally block.
                with open(filename) as f:
                    data = json.load(f)
                # Unpack in the order three_threads_v2.py dumped the tuple.
                (id_, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data
                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                          (id_, time_published, author, title, content, lang,
                           comments_count, reading_count, score, is_tutorial,
                           tags_string))
            except IOError:
                logging.info('FileNotExists')
                continue
    finally:
        # Persist whatever was inserted even if the loop aborts mid-way,
        # then release the connection (the original never closed it).
        conn.commit()
        conn.close()
# Import the last chunk of downloaded json files and report elapsed time.
start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)
 , :
- 490 406 228 512 . , (261894) .
- , , 2.95 . — 495 .
- 37804 . , .
- — alizar — 8774 .
- — 1448
- — 1660841
- — 2444