Good afternoon. It has been 2 years since the last article on parsing Habr was written, and some things have changed since then.
When I decided I wanted my own copy of Habr, I set out to write a parser that would save all of the authors' content to a database. How it turned out and what mistakes I ran into, you can read below the cut.
Part 2 | mega.nz | Github
The first version of the parser. One thread, many problems
To start with, I decided to make a prototype script that would parse an article and immediately put it into the database. Without thinking twice, I went with sqlite3, because it was less work: no need to run a local server, you just create it, look at it, delete it, and so on.
one_thread.py

from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime


def main(min_id, max_id):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")

    start_time = datetime.now()

    for i in range(min_id, max_id):
        url = "https://m.habr.com/post/{}".format(i)

        try:
            r = requests.get(url)
        except requests.RequestException:
            # Remember the id of the failed request and move on
            with open("req_errors.txt", "a") as file:
                file.write(str(i) + "\n")
            continue

        if r.status_code != 200:
            print("{} - {}".format(i, r.status_code))
            continue

        soup = BeautifulSoup(r.text, 'html.parser')

        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = str(soup.find(id="post-content-body"))
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]  # strip the leading label from the tags string
        except AttributeError:
            # The page exists but does not look like a normal article
            author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
            content = " ."

        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)

    # A single commit for the entire run
    conn.commit()
    print(datetime.now() - start_time)


main(1, 490406)
Everything here is classic: we use Beautiful Soup and requests, and a quick prototype is ready. Except that...
- Pages are downloaded in a single thread.
- If you interrupt the script, the whole database goes nowhere, since the commit only happens after all the parsing is done. Of course, you could commit after every insert, but that would significantly increase the run time.
- Parsing the first 100,000 articles took me 8 hours.
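A middle ground is to commit in batches rather than per row or once at the very end; here is a minimal sketch of the idea (the batch size and the placeholder row are my own choices, not from the original script):

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()
c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")

BATCH = 1000  # hypothetical batch size

for i in range(1, 490406):
    row = (i, "author", "title", "content", "tags")  # placeholder instead of real parsing
    c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
    if i % BATCH == 0:
        conn.commit()  # an interruption now loses at most BATCH rows

conn.commit()
conn.close()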
Second version. Many threads, a temporary ban from Habr

While browsing the internet on the subject of parsing, I came across an article by cointegrated, which pointed out two things:
- Multithreading speeds parsing up considerably.
- It is faster to parse the mobile version of Habr than the full one.

According to that article, it took cointegrated 126 hours to download 378 thousand articles.
To speed the process up, I decided to use multithreading in Python, namely multiprocessing.dummy, since threads gave the least trouble.

SQLite3 does not want to work with more than one thread.
That is fixable with check_same_thread=False, but it is not the only problem: attempts to insert into the database sometimes fail with errors I could not resolve.
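For illustration, a minimal sketch of what the shared-connection variant looks like; check_same_thread=False only disables the thread-ownership check, so the Lock (my addition, not something the original script had) serializes the writes by hand:

import sqlite3
from threading import Lock
from multiprocessing.dummy import Pool as ThreadPool

conn = sqlite3.connect('habr.db', check_same_thread=False)
conn.execute("CREATE TABLE IF NOT EXISTS habr(id INT)")
lock = Lock()

def insert_row(i):
    # Without the lock, concurrent INSERTs on one shared connection
    # can fail intermittently, which is exactly the problem described above
    with lock:
        conn.execute('INSERT INTO habr(id) VALUES (?)', (i,))

pool = ThreadPool(3)
pool.map(insert_row, range(100))
pool.close()
pool.join()
conn.commit()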
So I decided to drop inserting articles straight into the database and, remembering cointegrated's approach, to write them to files instead: there are no issues with multithreaded writes to separate files.

Habr bans the ip for using more than three threads; overly zealous attempts to hammer the site can end in an ip ban for a couple of hours. So only 3 threads remain, but even that helps: the time to iterate over 100 articles drops from 26 to 12 seconds.

This second version looks as follows:
three_threads_v1.py

from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging


def worker(i):
    currentFile = "files\\{}.json".format(i)

    # Skip articles already downloaded on a previous run
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except requests.RequestException:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))

    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    soup = BeautifulSoup(r.text, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()
        timestamp = soup.find(class_='tm-user-meta__date')['title']
        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]  # strip the leading label from the tags string
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()
        rating = soup.find(class_="tm-votes-score").get_text()
    except (AttributeError, TypeError):
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = " ."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Each article goes into its own json file: no shared database, no locking
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify min and max. Usage: async_v1.py 1 100")
        sys.exit(1)

    logging.basicConfig(level=logging.INFO)  # make the info/warning messages visible

    min_id = int(sys.argv[1])
    max_id = int(sys.argv[2])

    pool = ThreadPool(3)
    start_time = datetime.now()
    results = pool.map(worker, range(min_id, max_id))
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
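The script takes the range of article ids as command-line arguments, for example:

python three_threads_v1.py 1 100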
Third version. Final

While debugging the second version, I discovered that Habr, as it turns out, has an API that the mobile version of the site queries. It loads faster than the mobile page, and the response is plain json that barely needs parsing at all. In the end, I decided to rewrite my script once more.

So, having found the API endpoint, we can start parsing it.
three_threads_v2.py

import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging


def worker(i):
    currentFile = "files\\{}.json".format(i)

    # Skip articles already downloaded on a previous run
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except requests.RequestException:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        data = (id, is_tutorial, time_published, title, content, comments_count, lang,
                tags_string, reading_count, author, score)

        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify min and max. Usage: asyc.py 1 100")
        sys.exit(1)

    min_id = int(sys.argv[1])
    max_id = int(sys.argv[2])

    pool = ThreadPool(3)
    start_time = datetime.now()
    results = pool.map(worker, range(min_id, max_id))
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
The json of each downloaded article stores the following fields:
- id
- is_tutorial
- time_published
- title
- content
- comments_count
- lang — the language the article is written in. So far only en and ru.
- tags_string — all of the article's tags in a single string
- reading_count
- author
- score — the article's rating.
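For orientation, this is roughly the shape of the API response the script relies on; the field set follows the code above, while all the values here are made-up placeholders:

{
    "success": true,
    "data": {
        "article": {
            "id": 1,
            "is_tutorial": false,
            "time_published": "2019-01-01T00:00Z",
            "comments_count": 4,
            "lang": "ru",
            "tags_string": "habr, parsing",
            "title": "...",
            "text_html": "...",
            "reading_count": 100,
            "author": {"login": "username"},
            "voting": {"score": 5}
        }
    }
}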
Thanks to the API, parsing time dropped to roughly 8 seconds per 100 urls.
Once the articles are downloaded, they still need to be processed and loaded into the database. The parser for that looks like this:
three_threads_parser.py

import json
import sqlite3
import logging
from datetime import datetime


def parser(min_id, max_id):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0')  # trade durability for insert speed
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
    lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")

    try:
        for i in range(min_id, max_id):
            try:
                filename = "files\\{}.json".format(i)
                with open(filename) as f:
                    data = json.load(f)
                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data
                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                          (id, time_published, author, title, content, lang,
                           comments_count, reading_count, score, is_tutorial, tags_string))
            except IOError:
                # Ids whose download failed simply have no file
                logging.info('FileNotExists')
                continue
    finally:
        # Commit whatever made it in, even if the loop was interrupted
        conn.commit()


start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)
Some statistics about the result:
- Out of 490,406 possible ids, only 228,512 articles were downloaded. It turns out that more than half (261,894) of the articles on Habr were hidden or deleted.
- The entire database weighs 2.95 GB. Compressed: 495 MB.
- In total, 37,804 people have authored the downloaded articles. Keep in mind this covers published articles only.
- The most productive author is alizar: 8,774 articles.
- The highest-rated article: 1,448 pluses
- The most-read article: 1,660,841 views
- The most-discussed article: 2,444 comments
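Numbers like these fall out of simple queries against the finished base; a minimal sketch (the queries are mine, not from the original post):

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()

# How many distinct authors there are
c.execute("SELECT COUNT(DISTINCT author) FROM articles")
print(c.fetchone()[0])

# Who wrote the most articles
c.execute("SELECT author, COUNT(*) AS cnt FROM articles "
          "GROUP BY author ORDER BY cnt DESC LIMIT 1")
print(c.fetchone())

# The highest-rated, most-read and most-discussed articles
for column in ("score", "reading_count", "comments_count"):
    c.execute("SELECT title, {} FROM articles ORDER BY {} DESC LIMIT 1".format(column, column))
    print(c.fetchone())

conn.close()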