Análisis de la tasa de propagación de COVID-19 y publicación de los resultados en dstack.ai

Prefacio del traductor



Hola a todos, hace mucho tiempo que no escribo en Habr, pero había una razón. Trabajé en el campo del análisis de datos durante algunos años y lo más importante que entendí durante este tiempo es que en el análisis de datos faltan muchísimas herramientas, herramientas de todo tipo. Me preocupaban varias cosas, una de las cuales era la dificultad que enfrenta el especialista en análisis de datos al tratar de compartir los resultados de su trabajo con un gerente o incluso con un colega de profesión. Por lo general, para esto se utilizan las herramientas que se tienen a mano: correo electrónico, mensajería instantánea, Dropbox, etc. Junto con Andrei y otro amigo nuestro decidimos intentar hacer algo significativo en esta área, y hoy quiero contarles lo que hicimos. En la situación en la que todos nos encontramos debido al virus COVID-19, el problema de publicar y discutir los resultados de la investigación se ha vuelto, quizás, más urgente que nunca.


, , dstack.ai , . , , , , - , .
, vitaly at dstack.ai.



COVID-19 — , SARS-CoV-2 (2019-nCoV). — , /, .



, .

, , .

: |

COVID-19


, . dstack.ai, , , .


COVID-19 . COVID-19 , , .


, , , , Python , COVID-19 , , .
, ...


  • Python, pandas plot.ly . , data science Python .
  • COVID-19.
  • Python dstack.ai API , .

, . , , andrey at dstack.ai.


, Python Jupyter , Jupyter, Python.
, — . , : pandas , plot.ly .


pandas – de facto Python, (, , Matplotlib and Bokeh). plotly.express – , plotly , ( plotly – ).
dstack.ai, dstack Python ( conda ).


import pandas as pd
import plotly.express as px
from dstack import create_frame

, – , , . , , , , pandas. – , . COVID-19, .


, , , COVID-19 /, . , URL pandas, :


# CSV of the "Confirmed" COVID-19 time series from the CSSEGISandData GitHub
# repository, requested through the data.humdata.org hxlproxy preview endpoint.
# NOTE(review): the upstream file name may have changed since this was
# written — verify that the URL still resolves before relying on it.
url = "https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_19-covid-Confirmed.csv&filename=time_series_2019-ncov-Confirmed.csv"
df = pd.read_csv(url) # read the CSV straight from the URL into a DataFrame
df.head() # show the first 5 rows to check what was loaded

datos descargados


( , , , , , – ).


, – COVID-19 . , , pandas API : – ( , , – ):


# Keep the second column of `df` (presumably "Country/Region" — confirm
# against the loaded data) together with its last two columns.
cols = [df.columns[1], *df.columns[-2:]]
# Only rows that have no Province/State value; copy so that columns added
# to `last_2_days` later do not write back into `df`.
no_province = df["Province/State"].isnull()
last_2_days = df.loc[no_province, cols].copy()
last_2_days  # display the selection


, , , . , : , :


# d2 is the second-to-last column of `last_2_days`, d1 is the last one.
d2, d1 = last_2_days.columns[-2:]
# Absolute change between the two columns, and that change relative to the
# earlier (d2) column.
growth = last_2_days[d1] - last_2_days[d2]
last_2_days["delta"] = growth
last_2_days["delta%"] = growth / last_2_days[d2]
last_2_days


, . , dstack. API, pandas , plotly ( ).


dstack – , , : , , .


dstack.ai , .


, pandas () plotly , dstack, , . URL https://dstack.ai/<user>/<stack>. . – , . – . (attachments). ( , , , - , – ).


. , . :


  1. covid19/speed ( , – ). , : https://dstack.ai/<user>/covid19/speed.

, dstack.


dstack config --token <token> --user <user>

dstack , .dstack/config.yaml (.. , – ). , docs.dstack.ai.


  1. (.. – ), , , . (commited) .


  2. (push) :



# Minimum value a row must exceed in the second column of `last_2_days`
# to be included in the published tables.
min_cases = 50
# Frame for the "covid19/speed" stack.
top_speed_frame = create_frame("covid19/speed")

# The threshold filter does not depend on the sort column, so apply it once.
eligible = last_2_days[last_2_days[last_2_days.columns[1]] > min_cases]
description = (
    "Top 50 countries with the "
    "fastest growing number of confirmed "
    f"Covid-19 cases (at least {min_cases})"
)

# One table per sort column, distinguished by the "Sort by" parameter.
sort_by_cols = ["delta", "delta%"]
for col in sort_by_cols:
    # The method chain sits on one line: a line ending in a bare "." with
    # no backslash or enclosing parentheses is a SyntaxError.
    top = eligible.sort_values(by=[col], ascending=False).head(50)
    top_speed_frame.commit(top, description, {"Sort by": col})

top_speed_frame.push()

: https://dstack.ai/cheptsov/covid19/speed.



- , , . , . .


, , – , :


# Rows for Italy that have no Province/State value, restricted to the columns
# from the fifth onward and transposed so those column labels become the index.
italy_rows = (df["Country/Region"] == "Italy") & df["Province/State"].isnull()
cdf = df.loc[italy_rows, df.columns[4:]].T
# After transposing, the first column is labelled with the source row's index
# label; give it a readable name instead.
value_col = cdf.columns[0]
cdf = cdf.rename(columns={value_col: "confirmed"})

, , . , x , y :


# Line chart of the "confirmed" column, with the frame's index on the x axis.
fig = px.line(cdf, x=cdf.index, y="confirmed")
fig.show()


- . ?


, , .
. pandas API:


# Row-to-row change: each row holds the following row's value minus its own.
# NOTE(review): this labels every increase with the earlier of the two index
# entries — confirm that this alignment is intended.
delta = (cdf.shift(-1) - cdf)
delta.tail() # inspect the last rows; the final one is NaN because no row follows it


, , , , , ( ):


# Line chart of the "confirmed" column of `delta`, with its index on the x axis.
fig = px.line(delta, x=delta.index, y="confirmed")
fig.show()


, , pandas. – , , .


3 (figures) plotly: , , :


def plots_by_country(country):
    """Return three plotly line figures for one country's confirmed counts.

    Reads the module-level ``df``: takes the rows whose ``Country/Region``
    equals *country* and whose ``Province/State`` is null, keeps the columns
    from the fifth onward and transposes them, so those column labels become
    the index used as the x axis of every figure.

    Args:
        country: Value to match against the ``Country/Region`` column.

    Returns:
        A tuple ``(cfig, cdfig, cdpfig)`` plotting, in order, the
        "confirmed" values, "confirmed per day" (the following row minus the
        current row) and "confirmed per day %" (that difference divided by
        the following row's value).
    """
    row_mask = (df["Country/Region"] == country) & df["Province/State"].isnull()
    series = df.loc[row_mask, df.columns[4:]].T
    series = series.rename(columns={series.columns[0]: "confirmed"})
    x_axis = series.index

    upcoming = series.shift(-1)
    increase = upcoming - series
    per_day = increase.rename(columns={"confirmed": "confirmed per day"})
    # NOTE(review): the denominator is the following row's value, whereas the
    # "delta%" column built elsewhere in this file divides by the earlier
    # value — confirm which definition is intended.
    per_day_pct = (increase / upcoming).rename(
        columns={"confirmed": "confirmed per day %"}
    )

    return (
        px.line(series, x=x_axis, y="confirmed"),
        px.line(per_day, x=x_axis, y="confirmed per day"),
        px.line(per_day_pct, x=x_axis, y="confirmed per day %"),
    )

, , :


# Build the three figures for one country and display each of them in turn.
(fig1, fig2, fig3) = plots_by_country("Austria")
fig1.show()
fig2.show()
fig3.show()




30 . , (dashboards) . :


# Country names of the 30 rows without a Province/State value that have the
# largest value in the last column of `df`.
country_rows = df[df["Province/State"].isnull()]
ranked = country_rows.sort_values(by=[df.columns[-1]], ascending=False)
countries = ranked[["Country/Region"]].head(30)

# One frame holding three charts per country; each chart is committed with
# its "Country" and "Chart" parameters.
frame = create_frame("covid19/speed_by_country")
for c in countries["Country/Region"].tolist():
    print(c)
    fig1, fig2, fig3 = plots_by_country(c)
    for chart, title, label in (
        (fig1, f"Confirmed cases in {c}", "All cases"),
        (fig2, f"New confirmed cases in {c}", "New cases"),
        (fig3, f"New confirmed cases in {c} in %", "New cases (%)"),
    ):
        frame.commit(chart, title, {"Country": c, "Chart": label})

frame.push()

https://dstack.ai/cheptsov/covid19/speed_by_country.



, , , , , , ( , top 10 – ) .


: ( – ):


# Rows for Italy that have no Province/State value, restricted to the columns
# from the fifth onward and transposed so those column labels become the index.
t1 = df[(df["Country/Region"]=="Italy") & (df["Province/State"].isnull())][df.columns[4:]].T
t1 = t1.rename(columns={t1.columns[0]:"confirmed"})
# The index is deliberately left as it is. DataFrame.reset_index() returns a
# new frame instead of modifying in place, so a bare `t1.reset_index()` whose
# result is discarded would have no effect.
t1["Country/Region"] = "Italy" # tag every row with the country name
t1.tail() # inspect the last rows


, , , :


def country_df(country):
    """Build the confirmed-count frames for one country.

    Reads the module-level ``df``: takes the rows whose ``Country/Region``
    equals *country* and whose ``Province/State`` is null, keeps the columns
    from the fifth onward and transposes them, so those column labels become
    the index of every returned frame.

    Args:
        country: Value to match against the ``Country/Region`` column; it is
            also written into the ``Country/Region`` column of each result.

    Returns:
        A tuple ``(cdf, delta, delta_p)`` of DataFrames sharing one index,
        each with an added "Country/Region" column:

        * ``cdf`` – column "confirmed";
        * ``delta`` – column "confirmed per day": the following row minus
          the current row (the last row is NaN);
        * ``delta_p`` – column "confirmed per day %": that difference
          divided by the following row's value.

    Raises:
        IndexError: If no row of ``df`` matches, because the transposed
            frame then has no first column to rename.
    """
    selected = (df["Country/Region"] == country) & (df["Province/State"].isnull())
    cdf = df[selected][df.columns[4:]].T
    cdf = cdf.rename(columns={cdf.columns[0]: "confirmed"})

    # Compute the shifted frame and the difference once and reuse them.
    next_row = cdf.shift(-1)
    change = next_row - cdf
    delta = change.rename(columns={"confirmed": "confirmed per day"})
    # NOTE(review): the denominator is the following row's value, whereas the
    # "delta%" column built elsewhere in this file divides by the earlier
    # value — confirm which definition is intended.
    delta_p = (change / next_row).rename(columns={"confirmed": "confirmed per day %"})

    # The index is kept as it is: code elsewhere in this file plots these
    # frames against `.index`. DataFrame.reset_index() returns a new frame,
    # so bare calls with a discarded result would be no-ops.
    for part in (cdf, delta, delta_p):
        part["Country/Region"] = country
    return (cdf, delta, delta_p)

10 , 3 , , , :


# Country names of the 10 rows without a Province/State value that have the
# largest value in the last column of `df`.
country_rows = df[df["Province/State"].isnull()]
ranked = country_rows.sort_values(by=[df.columns[-1]], ascending=False)
top10 = ranked[["Country/Region"]].head(10)

# Build the three frames for every country, then split them into three
# parallel lists (values, per-row change, relative change).
per_country = [country_df(name) for name in top10["Country/Region"].tolist()]
top = [frames[0] for frames in per_country]
top_delta = [frames[1] for frames in per_country]
top_delta_p = [frames[2] for frames in per_country]

# Stack the per-country value frames into one frame.
test = pd.concat(top)
# Check the result: one line per country, with the shared index on the x axis.
px.line(test, x=test.index, y="confirmed", color='Country/Region').show()


, , , :


  • 10 .
  • 10 .
  • 10 .
  • 30 .
  • , , .
  • , .

:



# Frame combining the top-10 comparison charts with the per-country charts.
frame = create_frame("covid19/speed_by_country_all")

# Stack the per-country frames of each kind into one frame per kind.
top10df = pd.concat(top)
top10df_delta = pd.concat(top_delta)
top10df_delta_p = pd.concat(top_delta_p)

# Comparison charts: one line per country, committed under "Country": "Top 10".
summary_charts = (
    (top10df, "confirmed", "Confirmed cases in top 10 countries", "All cases"),
    (top10df_delta, "confirmed per day", "New confirmed cases in top 10 countries", "New cases"),
    (top10df_delta_p, "confirmed per day %", "New confirmed cases in top 10 countries in %", "New cases (%)"),
)
for data, y_col, title, label in summary_charts:
    fig = px.line(data, x=data.index, y=y_col, color='Country/Region')
    frame.commit(fig, title, {"Country": "Top 10", "Chart": label})

# Per-country charts, three for every entry of `countries`.
for c in countries["Country/Region"].tolist():
    print(c)
    fig1, fig2, fig3 = plots_by_country(c)
    for chart, title, label in (
        (fig1, f"Confirmed cases in {c}", "All cases"),
        (fig2, f"New confirmed cases in {c}", "New cases"),
        (fig3, f"New confirmed cases in {c} in %", "New cases (%)"),
    ):
        frame.commit(chart, title, {"Country": c, "Chart": label})

frame.push()

: https://dstack.ai/cheptsov/covid19/speed_by_country_all.


. , , , . GitHub ( , , , – ) .


, :



All Articles