Good day. It has been two years since the last article about parsing Habr was written, and a few things have changed since then.
When I decided I wanted my own copy of Habr, I set out to write a parser that would save all of the authors' content to a database. How it went and what mistakes I made, you can read under the cut.
Part 2 | mega.nz | Github
The first version of the parser. One thread, many problems
To start with, I decided to write a prototype script that parses an article and immediately writes it to the database. Without thinking twice I used sqlite3, because it was less work: no need to have a local server that has to be created, dropped and so on.
one_thread.py

from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")

    for i in range(min, max):
        url = "https://m.habr.com/post/{}".format(i)
        try:
            r = requests.get(url)
        except:
            # remember the id of the request that failed and move on
            with open("req_errors.txt", "a") as file:
                file.write(str(i) + "\n")
            continue
        if r.status_code != 200:
            print("{} - {}".format(i, r.status_code))
            continue

        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = soup.find(id="post-content-body")
            content = str(content)
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]
        except:
            author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
            content = "Error"

        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)

    # a single commit at the very end
    c.execute("commit")
    print(datetime.now() - start_time)

main(1, 490406)
Everything is classic: we use Beautiful Soup and requests, and a quick prototype is ready. Except that...
Pages are downloaded in a single thread.
If you interrupt the script, the whole database is lost, because the commit only runs after all the parsing is finished.
Of course, you can commit to the database after every insert, but then the script's execution time grows considerably (a compromise is sketched just below).
Parsing the first 100,000 articles took me 8 hours.
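The prototype keeps everything in one transaction and commits once at the very end. Purely as an illustration (the insert_batched helper is not part of the article's code, and BATCH_SIZE is an assumed value), a middle ground is to commit every N rows, so that interrupting the run loses at most the last batch while still avoiding a commit per row:

import sqlite3

BATCH_SIZE = 1000  # assumed value, tune to taste

def insert_batched(conn, rows):
    # Hypothetical helper: rows is an iterable of 5-tuples matching the
    # habr table above (id, author, title, content, tags).
    c = conn.cursor()
    for n, row in enumerate(rows, 1):
        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
        if n % BATCH_SIZE == 0:
            conn.commit()   # persist the finished batch
    conn.commit()           # persist whatever is left at the end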
Then I came across an article by cointegrated, read it, and found a couple of tricks there for speeding the process up:
- Using multithreading speeds up the download considerably.
- You can fetch the mobile version of Habr instead of the full one.
For example, the cointegrated article that weighs 378 KB in the desktop version comes to only 126 KB in the mobile one.
The second version. Many threads, a temporary ban from Habr
Poking around the internet on the topic of multithreading in python, I picked the simplest option with multiprocessing.dummy, and noticed that problems appeared along with the multithreading.
SQLite3 does not want to work with more than one thread.
It is fixed with check_same_thread=False, but that is not the only error: when inserting into the database, errors sometimes appear that I never managed to resolve.
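For reference, the flag is passed to sqlite3.connect. A minimal sketch (the lock and insert_row helper are my own illustration, not the article's code): check_same_thread=False only disables the module's cross-thread check, so access to the shared connection still has to be serialized, which is exactly where hard-to-debug errors tend to come from.

import sqlite3
import threading

conn = sqlite3.connect('habr.db', check_same_thread=False)
db_lock = threading.Lock()

def insert_row(row):
    # Hypothetical helper: serialize all writes through one lock.
    with db_lock:
        conn.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
        conn.commit()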
So I decide to give up on inserting articles straight into the database and, remembering cointegrated's solution, to use files instead, since there is no problem with writing to files from multiple threads.
Habr starts banning you once you use more than three threads.
Particularly zealous attempts to get through to Habr can end with an ip ban for a couple of hours. So only 3 threads can be used, but even that is already good, since the time to go through 100 articles drops from 26 to 12 seconds.
With that in mind, the second version of the parser looks like this:
three_threads_v1.py

from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    # skip articles that have already been downloaded
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    # Habr answers 503 once it starts banning us
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))

    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')
        timestamp = timestamp['title']

        content = soup.find(id="post-content-body")
        content = str(content)

        title = soup.find(class_="tm-article-title__text").get_text()

        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]

        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except:
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "Error"
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # each article is dumped into its own json file
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify min and max. Example: async_v1.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # 3 threads, since Habr bans us for more
    pool = ThreadPool(3)

    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    pool.close()
    pool.join()

    print(datetime.now() - start_time)
The third version. The final one
While debugging the second version, I discovered that Habr, all of a sudden, has an API that the mobile version of the site uses. It loads faster than the mobile page, since it is just json that hardly even needs parsing. In the end I decided to rewrite my script once more.
So, having found this API endpoint, we can start parsing it.
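To illustrate the shape of the response the script below relies on (inferred from the fields the script reads, so treat it as an assumption rather than documentation), probing a single article looks like this:

import requests

# Fetch one article through the same endpoint the script uses.
url = "https://m.habr.com/kek/v1/articles/1/?fl=ru%2Cen&hl=ru"
data = requests.get(url).json()

# The script checks data['success'] and then reads data['data']['article'].
if data.get('success'):
    article = data['data']['article']
    print(article['title'], article['author']['login'])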
three_threads_v2.py

import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    currentFile = "files\\{}.json".format(i)

    # skip articles that have already been downloaded
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except:
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        # keep the tuple order in sync with three_threads_parser.py
        data = (id, is_tutorial, time_published, title, content, comments_count, lang,
                tags_string, reading_count, author, score)

        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Specify min and max. Example: asyc.py 1 100")
        sys.exit(1)
    min = int(sys.argv[1])
    max = int(sys.argv[2])

    pool = ThreadPool(3)

    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    pool.close()
    pool.join()

    print(datetime.now() - start_time)
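Launching it is straightforward; for example, python three_threads_v2.py 1 100 downloads article ids 1 through 99 (range excludes the upper bound) into the files\ directory, three at a time.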
The json returned for each article contains fields describing both the article itself and its author. From it, the following fields end up in the saved file:
- id
- is_tutorial
- time_published
- title
- content
- comments_count
- lang — the language the article is written in. Only en or ru.
- tags_string — all of the article's tags in one string
- reading_count
- author
- score — the article's rating.
Thus, using the API, I got down to 8 seconds per 100 urls.
Once the articles are downloaded, it remains to process them and put the data into the database. The script for that:
three_threads_parser.py

import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0')    # trade durability for insert speed
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, \
    lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")

    try:
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                f = open(filename)
                data = json.load(f)

                # the tuple order matches the order used when dumping in three_threads_v2.py
                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data

                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (id, time_published, author,
                                                                                            title, content, lang,
                                                                                            comments_count, reading_count,
                                                                                            score, is_tutorial,
                                                                                            tags_string))
                f.close()
            except IOError:
                # no json for this id: the article was never downloaded
                logging.info('FileNotExists')
                continue
    finally:
        # a single commit at the end keeps the whole run fast
        conn.commit()

start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)
And, by tradition, a bit of statistics:
- Of the 490,406 expected articles, only 228,512 were downloaded. It turns out that more than half (261,894) of the articles on Habr have been hidden or deleted.
- The whole database weighs 2.95 GB. In compressed form, 495 MB.
- In total, 37,804 people have authored articles on Habr (counting only the articles that are still live).
- The most prolific author, alizar, has 8,774 articles.
- The highest-rated article has 1,448 points.
- The most-read article has 1,660,841 views.
- The most-discussed article has 2,444 comments.
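Once the articles table is filled, numbers like these are easy to reproduce. A small sketch of the kind of queries involved (my own illustration, not part of the original scripts):

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()

# number of distinct authors among the downloaded articles
c.execute("SELECT COUNT(DISTINCT author) FROM articles")
print(c.fetchone())

# the most prolific author and their article count
c.execute("SELECT author, COUNT(*) FROM articles GROUP BY author ORDER BY COUNT(*) DESC LIMIT 1")
print(c.fetchone())

# maxima for rating, views and comments
c.execute("SELECT MAX(score), MAX(reading_count), MAX(comments_count) FROM articles")
print(c.fetchone())

conn.close()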