COVID-19 propagation rate analysis and publication of the results on dstack.ai

Translator's Preface



Hello everyone. I haven't written anything on Habr for a long time, but now there is a reason to. I have worked in data analysis for quite a few years, and the most important thing I have understood during this time is that data analysis is sorely lacking tools — tools of many different kinds. Several things bothered me, one of which was the difficulty a data analysis specialist faces when trying to share the results of their work with a manager or even a fellow practitioner. Usually whatever tools are at hand — email, IM, Dropbox, etc. — are used for this. Together with Andrei and another friend of ours, we decided to try to do something meaningful in this area, and today I want to tell you about what we did. In the situation in which we all find ourselves due to the COVID-19 virus, the problem of publishing and discussing research results has become, perhaps, more relevant than ever before.


, , dstack.ai , . , , , , - , .
, vitaly at dstack.ai.



COVID-19 — , SARS-CoV-2 (2019-nCoV). — , /, .



, .

, , .

: |

COVID-19


, . dstack.ai, , , .


COVID-19 . COVID-19 , , .


, , , , Python , COVID-19 , , .
, ...


  • Python, pandas plot.ly . , data science Python .
  • COVID-19.
  • Python dstack.ai API , .

, . , , andrey at dstack.ai.


, Python Jupyter , Jupyter, Python.
, — . , : pandas , plot.ly .


pandas – de facto Python, (, , Matplotlib and Bokeh). plotly.express – , plotly , ( plotly – ).
dstack.ai, dstack Python ( conda – ).


import pandas as pd
import plotly.express as px
from dstack import create_frame

, – , , . , , , , pandas. – , . COVID-19, .


, , , COVID-19 /, . , URL pandas, :


# Confirmed-cases time series, requested through the data.humdata.org proxy,
# which is pointed at the CSSEGISandData/COVID-19 repository on GitHub.
url = (
    "https://data.humdata.org/hxlproxy/api/data-preview.csv"
    "?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19"
    "%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series"
    "%2Ftime_series_19-covid-Confirmed.csv"
    "&filename=time_series_2019-ncov-Confirmed.csv"
)
df = pd.read_csv(url)  # pandas downloads and parses the CSV straight from the URL
df.head()  # notebook preview: first 5 rows of the loaded table

downloaded data


( , , , , , – ).


, – COVID-19 . , , pandas API : – ( , , – ):


# Keep the 2nd column of df (presumably the country name) plus the two
# right-most columns, i.e. the two most recent entries of the time series.
cols = [df.columns[1], *df.columns[-2:]]
# Only rows with an empty Province/State are kept; .copy() gives an
# independent frame so that new columns can be added to it later.
last_2_days = df.loc[df["Province/State"].isna(), cols].copy()
last_2_days  # notebook preview of the selection


, , , . , : , :


# last_2_days is laid out as [country, previous day, last day, ...].
# The two date columns are taken by fixed position (1 and 2) rather than
# counted from the end, so that re-running this cell after "delta" and
# "delta%" have been appended still picks the date columns.
d2, d1 = last_2_days.columns[1:3]  # d2: previous day, d1: last day
# Absolute increase between the two days.
last_2_days["delta"] = last_2_days[d1] - last_2_days[d2]
# Increase relative to the previous day's value (a fraction, not multiplied
# by 100). Rows whose previous-day value is 0 yield inf/NaN here.
last_2_days["delta%"] = last_2_days["delta"] / last_2_days[d2]
last_2_days  # notebook preview


, . , dstack. API, pandas , plotly ( ).


dstack – , , : , , .


dstack.ai , .


, pandas () plotly , dstack, , . URL https://dstack.ai/<user>/<stack>. . – , . – . (attachments). ( , , , - , – ).


. , . :


  1. covid19/speed ( , – ). , : https://dstack.ai/<user>/covid19/speed.

, dstack.


# Pass your dstack token and user name to the dstack CLI
# (replace the <token> and <user> placeholders with your own values).
dstack config --token <token> --user <user>

dstack , .dstack/config.yaml (.. , – ). , docs.dstack.ai.


  1. (.. – ), , , . (commited) .


  2. (push) :



# Countries at or below this number of cases are left out of the ranking.
min_cases = 50
# Frame that collects the tables published under the "covid19/speed" stack.
top_speed_frame = create_frame("covid19/speed")
# One table is committed per sort order.
sort_by_cols = ["delta", "delta%"]
# The threshold is applied to column 1 of last_2_days (the earlier of the two
# day columns). The filter does not depend on the sort column, so it is
# computed once outside the loop.
enough_cases = last_2_days[last_2_days[last_2_days.columns[1]] > min_cases]
for col in sort_by_cols:
    # The chained call is kept on one line; the previous version broke the
    # expression after a trailing "." without parentheses, which is a
    # SyntaxError in Python.
    top = enough_cases.sort_values(by=[col], ascending=False).head(50)
    # Commit the table together with the parameter value that selects it.
    top_speed_frame.commit(
        top,
        "Top 50 countries with the fastest growing number of confirmed "
        f"Covid-19 cases (at least {min_cases})",
        {"Sort by": col},
    )

top_speed_frame.push()

: https://dstack.ai/cheptsov/covid19/speed.



- , , . , . .


, , – , :


# Select the Italy row that has no Province/State, keep the columns from
# position 4 onwards, and transpose so that those columns become the index.
cdf = df.loc[
    (df["Country/Region"] == "Italy") & df["Province/State"].isna(),
    df.columns[4:],
].T
# After transposing, the single data column is labelled with the original
# row label; give it a readable name instead.
cdf = cdf.rename(columns={cdf.columns[0]: "confirmed"})

, , . , x , y :


# Line chart of the "confirmed" column with the frame's index on the x axis.
fig = px.line(data_frame=cdf, x=cdf.index, y="confirmed")
fig.show()


- . ?


, , .
. pandas API:


# Row-wise increase: the value of the following row minus the current row.
# shift(-1) moves every value one row up, so the final row has no successor
# and its difference is NaN.
# NOTE(review): each increase is stored against the earlier of the two rows.
delta = cdf.shift(periods=-1) - cdf
delta.tail()  # notebook preview of the last rows


, , , , , ( ):


# Line chart of the row-to-row increase with the frame's index on the x axis.
fig = px.line(data_frame=delta, x=delta.index, y="confirmed")
fig.show()


, , pandas. – , , .


3 (figures) plotly: , , :


def plots_by_country(country):
    """Build three plotly line charts for a single country.

    Reads the module-level ``df``, keeps the row whose ``Country/Region``
    equals *country* and whose ``Province/State`` is empty, and uses the
    columns from position 4 onwards (presumably the per-date counts) as the
    time series.

    Returns a tuple ``(cfig, cdfig, cdpfig)``:

    * the confirmed values,
    * the increase to the following row ("confirmed per day"),
    * that increase divided by the following row's value
      ("confirmed per day %").

    Raises ``IndexError`` when no row matches, because the transposed
    selection then has no column to rename.
    """
    is_country = df["Country/Region"] == country
    is_country_level = df["Province/State"].isnull()
    series = df[is_country & is_country_level][df.columns[4:]].T
    series = series.rename(columns={series.columns[0]: "confirmed"})
    dates = series.index

    # shift(-1) pulls the next row's value up; the last row becomes NaN.
    following = series.shift(-1)
    increase = following - series
    per_day = increase.rename(columns={"confirmed": "confirmed per day"})
    per_day_share = (increase / following).rename(
        columns={"confirmed": "confirmed per day %"}
    )

    total_fig = px.line(series, x=dates, y="confirmed")
    per_day_fig = px.line(per_day, x=dates, y="confirmed per day")
    share_fig = px.line(per_day_share, x=dates, y="confirmed per day %")
    return (total_fig, per_day_fig, share_fig)

, , :


# Render the three charts for one country to check the helper's output.
fig1, fig2, fig3 = plots_by_country("Austria")
for _chart in (fig1, fig2, fig3):
    _chart.show()




30 . , (dashboards) . :


# The 30 rows without a Province/State that have the largest value in the
# last column of df; only the country name is kept.
countries = (
    df[df["Province/State"].isnull()]
    .sort_values(by=[df.columns[-1]], ascending=False)
    .loc[:, ["Country/Region"]]
    .head(30)
)

# One frame holds every chart; each commit is addressed by the
# "Country" and "Chart" parameters.
frame = create_frame("covid19/speed_by_country")
for c in countries["Country/Region"].tolist():
    print(c)  # progress output while the charts are being built
    fig1, fig2, fig3 = plots_by_country(c)
    charts = (
        (fig1, f"Confirmed cases in {c}", "All cases"),
        (fig2, f"New confirmed cases in {c}", "New cases"),
        (fig3, f"New confirmed cases in {c} in %", "New cases (%)"),
    )
    for chart, title, label in charts:
        frame.commit(chart, title, {"Country": c, "Chart": label})

frame.push()

https://dstack.ai/cheptsov/covid19/speed_by_country.



, , , , , , ( , top 10 – ) .


: ( – ):


# Select the Italy row that has no Province/State, keep the columns from
# position 4 onwards, and transpose so that those columns become the index.
t1 = df[(df["Country/Region"] == "Italy") & (df["Province/State"].isnull())][df.columns[4:]].T
t1 = t1.rename(columns={t1.columns[0]: "confirmed"})
# The index is left untouched: DataFrame.reset_index() returns a new frame
# instead of modifying t1, so calling it without using the result does nothing.
t1["Country/Region"] = "Italy"  # tag every row with the country name
t1.tail()  # notebook preview of the last rows


, , , :


def country_df(country, data=None):
    """Build the three per-country tables used for the combined charts.

    Parameters
    ----------
    country:
        Value to match in the ``Country/Region`` column.
    data:
        Table to read from. Defaults to the module-level ``df``. The columns
        from position 4 onwards are used as the time series.

    Returns
    -------
    tuple
        ``(cdf, delta, delta_p)``, all sharing the same index:

        * ``cdf`` - column ``"confirmed"``;
        * ``delta`` - column ``"confirmed per day"``: the following row's
          value minus the current one (NaN in the last row);
        * ``delta_p`` - column ``"confirmed per day %"``: that increase
          divided by the following row's value.

        Each table also carries a ``"Country/Region"`` column set to
        *country*, so several countries can be concatenated and told apart.

    Raises
    ------
    IndexError
        If no row matches *country* with an empty ``Province/State``.
    """
    source = df if data is None else data
    country_level = (source["Country/Region"] == country) & source["Province/State"].isnull()
    cdf = source[country_level][source.columns[4:]].T
    if cdf.shape[1] == 0:
        # Same exception type that indexing the empty column list would raise,
        # but with a message that names the actual problem.
        raise IndexError(f"no country-level row found for {country!r}")
    cdf = cdf.rename(columns={cdf.columns[0]: "confirmed"})

    # shift(-1) pulls the next row's value up; compute it once and reuse it.
    following = cdf.shift(-1)
    increase = following - cdf
    delta = increase.rename(columns={"confirmed": "confirmed per day"})
    delta_p = (increase / following).rename(columns={"confirmed": "confirmed per day %"})

    # The index is kept on purpose. reset_index() returns a new frame, so the
    # earlier calls that discarded its result had no effect.
    for table in (cdf, delta, delta_p):
        table["Country/Region"] = country
    return (cdf, delta, delta_p)

10 , 3 , , , :


# The 10 rows without a Province/State that have the largest value in the
# last column of df; only the country name is kept.
top10 = (
    df[df["Province/State"].isnull()]
    .sort_values(by=[df.columns[-1]], ascending=False)
    .loc[:, ["Country/Region"]]
    .head(10)
)

# Build the three tables for every country, then regroup them by kind so
# each list can be concatenated into one long table.
per_country = [country_df(name) for name in top10["Country/Region"].tolist()]
top = [tables[0] for tables in per_country]
top_delta = [tables[1] for tables in per_country]
top_delta_p = [tables[2] for tables in per_country]

test = pd.concat(top)  # stack the per-country tables on top of each other
# One line per country, distinguished by the "Country/Region" column.
px.line(test, x=test.index, y="confirmed", color='Country/Region').show()


, , , :


  • 10 .
  • 10 .
  • 10 .
  • 30 .
  • , , .
  • , .

:



# A single frame holds the combined top-10 charts and the per-country
# charts; each commit is addressed by the "Country" and "Chart" parameters.
frame = create_frame("covid19/speed_by_country_all")

# Stack the per-country tables collected earlier into one table per kind.
top10df = pd.concat(top)
top10df_delta = pd.concat(top_delta)
top10df_delta_p = pd.concat(top_delta_p)

# (table, y column, chart title, value of the "Chart" parameter)
summary_charts = (
    (top10df, "confirmed",
     "Confirmed cases in top 10 countries", "All cases"),
    (top10df_delta, "confirmed per day",
     "New confirmed cases in top 10 countries", "New cases"),
    (top10df_delta_p, "confirmed per day %",
     "New confirmed cases in top 10 countries in %", "New cases (%)"),
)
for table, y_column, title, label in summary_charts:
    fig = px.line(table, x=table.index, y=y_column, color='Country/Region')
    frame.commit(fig, title, {"Country": "Top 10", "Chart": label})

# Per-country charts for every country in the previously built list.
for c in countries["Country/Region"].tolist():
    print(c)  # progress output while the charts are being built
    fig1, fig2, fig3 = plots_by_country(c)
    charts = (
        (fig1, f"Confirmed cases in {c}", "All cases"),
        (fig2, f"New confirmed cases in {c}", "New cases"),
        (fig3, f"New confirmed cases in {c} in %", "New cases (%)"),
    )
    for chart, title, label in charts:
        frame.commit(chart, title, {"Country": c, "Chart": label})

frame.push()

: https://dstack.ai/cheptsov/covid19/speed_by_country_all.


. , , , . GitHub ( , , , – ) .


, :



All Articles