Análise da taxa de propagação da COVID-19 e publicação dos resultados no dstack.ai

Prefácio do tradutor



Olá pessoal, não escrevo nada para o Habr há muito tempo, mas havia um motivo. Trabalho no campo da análise de dados há alguns anos e a coisa mais importante que entendi durante esse período é que faltam muitas ferramentas na análise de dados, ferramentas dos mais variados tipos. Eu estava preocupado com várias coisas, uma das quais era a dificuldade que um especialista em análise de dados enfrenta ao tentar compartilhar os resultados de seu trabalho com um gerente ou mesmo com um colega de profissão. Normalmente, usa-se aqui qualquer ferramenta disponível: email, mensagens instantâneas, Dropbox etc. Andrei e outro amigo nosso decidiram tentar fazer algo significativo nessa área e hoje quero falar sobre o que fizemos. Na situação em que todos nós nos encontramos devido ao vírus COVID-19, o problema de publicar e discutir resultados de pesquisa se tornou, talvez, ainda mais relevante do que nunca.


, , dstack.ai , . , , , , - , .
, vitaly at dstack.ai.



COVID-19 — , SARS-CoV-2 (2019-nCoV). — , /, .



, .

, , .

: |

COVID-19


, . dstack.ai, , , .


COVID-19 . COVID-19 , , .


, , , , Python , COVID-19 , , .
, ...


  • Python, pandas plot.ly . , data science Python .
  • COVID-19.
  • Python dstack.ai API , .

, . , , andrey at dstack.ai.


, Python Jupyter , Jupyter, Python.
, — . , : pandas , plot.ly .


pandas – de facto Python, (, , Matplotlib and Bokeh). plotly.express – , plotly , ( plotly – ).
dstack.ai, dstack Python ( conda – ).


import pandas as pd
import plotly.express as px
from dstack import create_frame

, – , , . , , , , pandas. – , . COVID-19, .


, , , COVID-19 /, . , URL pandas, :


url = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_19-covid-Confirmed.csv&filename=time_series_2019-ncov-Confirmed.csv"
df = pd.read_csv(url)  # read the CSV straight from the URL into a DataFrame
df.head()  # show the first 5 rows to check what was loaded

dados baixados


( , , , , , – ).


, – COVID-19 . , , pandas API : – ( , , – ):


# Keep the second column of df (index 1) plus its last two columns
cols = [df.columns[1]] + list(df.columns[-2:])
last_2_days = df[df["Province/State"].isnull()][cols].copy()  # only rows without a Province/State; copy so columns can be added later
last_2_days  # display the result


, , , . , : , :


# d1 is the last column of the frame, d2 the one right before it.
d1, d2 = last_2_days.columns[-1], last_2_days.columns[-2]
# Absolute change between the two columns.
growth = last_2_days[d1] - last_2_days[d2]
last_2_days["delta"] = growth
# Change relative to the earlier column (a fraction; it is not multiplied by 100).
last_2_days["delta%"] = growth / last_2_days[d2]
last_2_days


, . , dstack. API, pandas , plotly ( ).


dstack – , , : , , .


dstack.ai , .


, pandas () plotly , dstack, , . URL https://dstack.ai/<user>/<stack>. . – , . – . (attachments). ( , , , - , – ).


. , . :


  1. covid19/speed ( , – ). , : https://dstack.ai/<user>/covid19/speed.

, dstack.


dstack config --token <token> --user <user>

dstack , .dstack/config.yaml (.. , – ). , docs.dstack.ai.


  1. (.. – ), , , . (commited) .


  2. (push) :



min_cases = 50
# Create a frame for the "covid19/speed" stack.
top_speed_frame = create_frame("covid19/speed")
# Columns to rank by; each one is committed under its own "Sort by" value.
sort_by_cols = ["delta", "delta%"]

# The threshold filter does not depend on the sort column, so apply it once.
# It compares the second column of last_2_days (index 1) against min_cases.
eligible = last_2_days[last_2_days[last_2_days.columns[1]] > min_cases]

# Adjacent literals inside parentheses are joined implicitly; no backslashes needed.
description = (
    f"Top 50 countries with the "
    f"fastest growing number of confirmed "
    f"Covid-19 cases (at least {min_cases})"
)

for col in sort_by_cols:
    # The original split this chain after a trailing ".", which is a syntax
    # error in Python; keeping the chain on one expression fixes it.
    top = eligible.sort_values(by=[col], ascending=False).head(50)
    # Commit the table to the frame, tagged with the column it is sorted by.
    top_speed_frame.commit(top, description, {"Sort by": col})

top_speed_frame.push()

: https://dstack.ai/cheptsov/covid19/speed.



- , , . , . .


, , – , :


# Pick the Italy row that has no Province/State, keep the columns from
# index 4 onward, and transpose so each of those columns becomes a row.
italy_mask = (df["Country/Region"] == "Italy") & (df["Province/State"].isnull())
cdf = df[italy_mask][df.columns[4:]].T
# Give the single resulting column a readable name.
first_column = cdf.columns[0]
cdf = cdf.rename(columns={first_column: "confirmed"})

, , . , x , y :


# Line chart of the "confirmed" column, with the frame's index on the x axis
fig = px.line(cdf, x=cdf.index, y="confirmed")
fig.show()


- . ?


, , .
. pandas API:


# Forward difference: each row holds the next row's value minus its own,
# so the last row has no successor and comes out as NaN.
# NOTE(review): cdf.diff() would attribute the increase to the later row
# instead -- confirm which alignment is intended.
delta = (cdf.shift(-1) - cdf)
delta.tail()  # inspect the last rows (the final one is NaN)


, , , , , ( ):


# Line chart of the row-to-row difference; the column is still named "confirmed"
fig = px.line(delta, x=delta.index, y="confirmed")
fig.show()


, , pandas. – , , .


3 (figures) plotly: , , :


def plots_by_country(country):
    """Build three plotly line charts for one country from the global ``df``.

    The row whose "Country/Region" equals ``country`` and whose
    "Province/State" is null is reduced to the columns from index 4 onward
    and transposed, so those columns become the rows (and the x axis).

    Returns a tuple ``(cfig, cdfig, cdpfig)`` of figures plotting:
      * "confirmed"           -- the values as they appear in ``df``;
      * "confirmed per day"   -- next row minus current row;
      * "confirmed per day %" -- that difference divided by the next row.

    The last point of the two difference charts is NaN because the final
    row has no successor.
    """
    is_country = (df["Country/Region"] == country) & (df["Province/State"].isnull())
    series = df[is_country][df.columns[4:]].T
    series = series.rename(columns={series.columns[0]: "confirmed"})
    x_axis = series.index

    following = series.shift(-1)
    increase = following - series

    per_day = increase.rename(columns={"confirmed": "confirmed per day"})
    # NOTE(review): the denominator here is the *next* row, whereas the
    # "delta%" column computed earlier in this file divides by the earlier
    # day -- confirm which definition is intended.
    per_day_share = (increase / following).rename(
        columns={"confirmed": "confirmed per day %"}
    )

    return (
        px.line(series, x=x_axis, y="confirmed"),
        px.line(per_day, x=x_axis, y="confirmed per day"),
        px.line(per_day_share, x=x_axis, y="confirmed per day %"),
    )

, , :


# Try the helper on one country and render the three charts in order.
fig1, fig2, fig3 = plots_by_country("Austria")
for figure in (fig1, fig2, fig3):
    figure.show()




30 . , (dashboards) . :


# Take the 30 rows without a Province/State that have the largest value in
# the last column of df, keeping only their "Country/Region" column.
country_rows = df[df["Province/State"].isnull()]
ranked_rows = country_rows.sort_values(by=[df.columns[-1]], ascending=False)
countries = ranked_rows[["Country/Region"]].head(30)

# Commit three charts per country to one frame, keyed by "Country" and "Chart".
frame = create_frame("covid19/speed_by_country")
for c in countries["Country/Region"].tolist():
    print(c)
    fig1, fig2, fig3 = plots_by_country(c)
    titles = (
        f"Confirmed cases in {c}",
        f"New confirmed cases in {c}",
        f"New confirmed cases in {c} in %",
    )
    chart_names = ("All cases", "New cases", "New cases (%)")
    for figure, title, chart_name in zip((fig1, fig2, fig3), titles, chart_names):
        frame.commit(figure, title, {"Country": c, "Chart": chart_name})

frame.push()

https://dstack.ai/cheptsov/covid19/speed_by_country.



, , , , , , ( , top 10 – ) .


: ( – ):


# Italy row without a Province/State, columns from index 4 onward, transposed
# so that those columns become the rows.
t1 = df[(df["Country/Region"]=="Italy") & (df["Province/State"].isnull())][df.columns[4:]].T
t1 = t1.rename(columns={t1.columns[0]:"confirmed"})
# The original called `t1.reset_index()` here and discarded the result.
# reset_index() returns a new DataFrame unless inplace=True, so the call had
# no effect and only built a throwaway copy; it is removed. The index is left
# untouched, which is what the combined plots in this file rely on (x=<frame>.index).
t1["Country/Region"] = "Italy"  # tag every row with the country name
t1.tail()  # display the last rows to check the result


, , , :


def country_df(country, data=None):
    """Return three per-country frames: totals, row-to-row change, relative change.

    Args:
        country: value to match in the "Country/Region" column. Only the row
            whose "Province/State" is null is used.
        data: DataFrame to read from. Defaults to the module-level ``df``,
            which keeps existing ``country_df(country)`` calls working.

    Returns:
        A tuple ``(cdf, delta, delta_p)``. Each frame is indexed by the
        names of the source columns from index 4 onward and carries a
        "Country/Region" column set to ``country``:
          * ``cdf``     -- column "confirmed";
          * ``delta``   -- column "confirmed per day": next row minus
            current row (the last row is NaN);
          * ``delta_p`` -- column "confirmed per day %": that difference
            divided by the next row's value (a fraction).

    Raises:
        IndexError: if no row matches (the transposed frame has no columns).
    """
    if data is None:
        data = df  # fall back to the dataset loaded at module level

    is_country = (data["Country/Region"] == country) & (data["Province/State"].isnull())
    cdf = data[is_country][data.columns[4:]].T
    cdf = cdf.rename(columns={cdf.columns[0]: "confirmed"})

    # Compute the shifted frame and the difference once; both derived frames
    # use them. This must happen before the string column is added to cdf.
    next_row = cdf.shift(-1)
    increase = next_row - cdf
    delta = increase.rename(columns={"confirmed": "confirmed per day"})
    delta_p = (increase / next_row).rename(columns={"confirmed": "confirmed per day %"})

    # The original called reset_index() on each frame and discarded the
    # result, which is a no-op; those calls are removed. The index is kept
    # because callers plot against it.
    for part in (cdf, delta, delta_p):
        part["Country/Region"] = country
    return (cdf, delta, delta_p)

10 , 3 , , , :


# Take the 10 rows without a Province/State that have the largest value in
# the last column of df, keeping only their "Country/Region" column.
country_rows = df[df["Province/State"].isnull()]
top10 = country_rows.sort_values(by=[df.columns[-1]], ascending=False)[["Country/Region"]].head(10)

# Build the three frames for every top-10 country, then split them into
# one list per kind of frame.
per_country = [country_df(name) for name in top10["Country/Region"].tolist()]
top = [totals for totals, _, _ in per_country]
top_delta = [change for _, change, _ in per_country]
top_delta_p = [share for _, _, share in per_country]

test = pd.concat(top)  # stack the per-country totals into one frame
# Quick check: one line per country, with the shared index on the x axis.
check_fig = px.line(test, x=test.index, y="confirmed", color='Country/Region')
check_fig.show()


, , , :


  • 10 .
  • 10 .
  • 10 .
  • 30 .
  • , , .
  • , .

:



frame = create_frame("covid19/speed_by_country_all")

# Stack the per-country frames collected for the top 10.
top10df = pd.concat(top)
top10df_delta = pd.concat(top_delta)
top10df_delta_p = pd.concat(top_delta_p)

# (frame, y column, description, "Chart" parameter) for each combined chart.
top10_charts = (
    (top10df, "confirmed", "Confirmed cases in top 10 countries", "All cases"),
    (top10df_delta, "confirmed per day", "New confirmed cases in top 10 countries", "New cases"),
    (top10df_delta_p, "confirmed per day %", "New confirmed cases in top 10 countries in %", "New cases (%)"),
)
for combined, y_column, title, chart_name in top10_charts:
    fig = px.line(combined, x=combined.index, y=y_column, color='Country/Region')
    frame.commit(fig, title, {"Country": "Top 10", "Chart": chart_name})

# Then the same three charts for each individual country.
for c in countries["Country/Region"].tolist():
    print(c)
    fig1, fig2, fig3 = plots_by_country(c)
    titles = (
        f"Confirmed cases in {c}",
        f"New confirmed cases in {c}",
        f"New confirmed cases in {c} in %",
    )
    chart_names = ("All cases", "New cases", "New cases (%)")
    for figure, title, chart_name in zip((fig1, fig2, fig3), titles, chart_names):
        frame.commit(figure, title, {"Country": c, "Chart": chart_name})

frame.push()

: https://dstack.ai/cheptsov/covid19/speed_by_country_all.


. , , , . GitHub ( , , , – ) .


, :



All Articles