5.2 Manipulating Data Frames
The dplyr
package from the tidyverse
introduces functions that perform some of the most common operations when working with data frames and uses names for these functions that are relatively easy to remember.
5.2.1 Change Row Names
5.2.2 Adding a column with mutate()
5.2.3 Subsetting with filter()
library(dplyr)
# add new column with temperature conversion from celsius to fahrenheit
df <- filter(df,
timestamp >= "2020-01-01 00:00:00",
timestamp <= "2020-12-31 23:45:00")
df.high <- filter(df, temp_c > 30)
Note: Whether you put the whole code on one line or split it after a comma does not have an effect on the computation, it is only more readable when the lines aren’t too wide.
5.2.4 Add/remove columns with select()
library(dplyr)
# Based on the upper example we remove the celsius column after calculation
df <- select(df, -temp_c)
# Create new data frame
df.new <- select(df, timestamp, temp_f)
# use select() in a so called dplyr pipe
df <- df %>%
dplyr::mutate(df, temp_f = temp_c * 1.8 + 32) %>%
select(-temp_c)
5.2.5 The pipe %>%
With the package dplyr
we can perform a series of operations, for example select and then filter, by sending the results of one function to another using what is called the pipe operator: %>%
.
Details can be found in Introduction to Data Science - Chapter 4.5
5.2.6 Wide to Long
library(tidyr)
# load test data set
df <- read.csv("https://github.com/hslu-ige-laes/edar/raw/master/sampleData/flatTempHum.csv",
stringsAsFactors=FALSE,
sep =";")
# create a copy of the dataframe and print the header and the first five line
head(df, 5)
## time FlatA_Hum FlatA_Temp FlatB_Hum FlatB_Temp FlatC_Hum
## 1 2018-10-03 00:00:00 53.0 24.43 38.8 22.40 44.0
## 2 2018-10-03 01:00:00 53.0 24.40 38.8 22.40 44.0
## 3 2018-10-03 02:00:00 53.0 24.40 39.3 22.40 44.7
## 4 2018-10-03 03:00:00 53.0 24.40 40.3 22.40 45.0
## 5 2018-10-03 04:00:00 53.3 24.40 41.0 22.37 45.2
## FlatC_Temp FlatD_Hum FlatD_Temp
## 1 24.5 49.0 24.43
## 2 24.5 49.0 24.40
## 3 24.5 48.3 24.38
## 4 24.5 48.0 24.33
## 5 24.5 47.7 24.30
# convert wide to long format
df.long <- as.data.frame(tidyr::pivot_longer(df,
cols = -time,
names_to = "sensor",
values_to = "value",
values_drop_na = TRUE))
# long format
head(df.long, 16)
## time sensor value
## 1 2018-10-03 00:00:00 FlatA_Hum 53.00
## 2 2018-10-03 00:00:00 FlatA_Temp 24.43
## 3 2018-10-03 00:00:00 FlatB_Hum 38.80
## 4 2018-10-03 00:00:00 FlatB_Temp 22.40
## 5 2018-10-03 00:00:00 FlatC_Hum 44.00
## 6 2018-10-03 00:00:00 FlatC_Temp 24.50
## 7 2018-10-03 00:00:00 FlatD_Hum 49.00
## 8 2018-10-03 00:00:00 FlatD_Temp 24.43
## 9 2018-10-03 01:00:00 FlatA_Hum 53.00
## 10 2018-10-03 01:00:00 FlatA_Temp 24.40
## 11 2018-10-03 01:00:00 FlatB_Hum 38.80
## 12 2018-10-03 01:00:00 FlatB_Temp 22.40
## 13 2018-10-03 01:00:00 FlatC_Hum 44.00
## 14 2018-10-03 01:00:00 FlatC_Temp 24.50
## 15 2018-10-03 01:00:00 FlatD_Hum 49.00
## 16 2018-10-03 01:00:00 FlatD_Temp 24.40
5.2.7 Long to Wide
## time sensor value
## 1 2018-10-03 00:00:00 FlatA_Hum 53.00
## 2 2018-10-03 00:00:00 FlatA_Temp 24.43
## 3 2018-10-03 00:00:00 FlatB_Hum 38.80
## 4 2018-10-03 00:00:00 FlatB_Temp 22.40
## 5 2018-10-03 00:00:00 FlatC_Hum 44.00
## 6 2018-10-03 00:00:00 FlatC_Temp 24.50
# convert long table into wide table
df.wide <- as.data.frame(tidyr::pivot_wider(df.long,
names_from = "sensor",
values_from = "value")
)
# wide format
head(df.wide)
## time FlatA_Hum FlatA_Temp FlatB_Hum FlatB_Temp FlatC_Hum
## 1 2018-10-03 00:00:00 53.0 24.43 38.8 22.40 44.0
## 2 2018-10-03 01:00:00 53.0 24.40 38.8 22.40 44.0
## 3 2018-10-03 02:00:00 53.0 24.40 39.3 22.40 44.7
## 4 2018-10-03 03:00:00 53.0 24.40 40.3 22.40 45.0
## 5 2018-10-03 04:00:00 53.3 24.40 41.0 22.37 45.2
## 6 2018-10-03 05:00:00 53.7 24.40 41.2 22.30 47.2
## FlatC_Temp FlatD_Hum FlatD_Temp
## 1 24.50 49.0 24.43
## 2 24.50 49.0 24.40
## 3 24.50 48.3 24.38
## 4 24.50 48.0 24.33
## 5 24.50 47.7 24.30
## 6 24.57 47.2 24.30
5.2.8 Merge two Dataframes
library(dplyr)
library(lubridate)
# read file one and parse dates
dfOutsideTemp <- read.csv("https://github.com/hslu-ige-laes/edar/raw/master/sampleData/centralOutsideTemp.csv",
stringsAsFactors=FALSE,
sep =";")
dfOutsideTemp$time <- parse_date_time(dfOutsideTemp$time,
orders = "YmdHMS",
tz = "Europe/Zurich")
# read file two and parse dates
dfFlatTempHum <- read.csv("https://github.com/hslu-ige-laes/edar/raw/master/sampleData/flatTempHum.csv",
stringsAsFactors=FALSE, sep =";")
dfFlatTempHum$time <- parse_date_time(dfFlatTempHum$time,
order = "YmdHMS",
tz = "Europe/Zurich")
# merge the two files into a new data frame and keep only rows where all values are available
df <- merge(dfOutsideTemp, dfFlatTempHum, by = "time") %>% na.omit()
head(df)
## time centralOutsideTemp FlatA_Hum FlatA_Temp FlatB_Hum
## 1 2018-10-03 00:00:00 11.80 53.0 24.43 38.8
## 2 2018-10-03 01:00:00 11.25 53.0 24.40 38.8
## 3 2018-10-03 02:00:00 11.45 53.0 24.40 39.3
## 4 2018-10-03 03:00:00 11.40 53.0 24.40 40.3
## 5 2018-10-03 04:00:00 11.10 53.3 24.40 41.0
## 6 2018-10-03 05:00:00 11.05 53.7 24.40 41.2
## FlatB_Temp FlatC_Hum FlatC_Temp FlatD_Hum FlatD_Temp
## 1 22.40 44.0 24.50 49.0 24.43
## 2 22.40 44.0 24.50 49.0 24.40
## 3 22.40 44.7 24.50 48.3 24.38
## 4 22.40 45.0 24.50 48.0 24.33
## 5 22.37 45.2 24.50 47.7 24.30
## 6 22.30 47.2 24.57 47.2 24.30