本文包含数据的创建、缺失值的判断与处理、日期处理、数据排序以及观测的选取。观测的选取从三个方面进行演示:下标/逻辑索引、subset() 函数和 SQL 语句;使用 SQL 语句前需要先安装并加载 sqldf 包。
类型转换函数在本文中没有进行测试,
常用的转换函数有 as.numeric()、as.character()、as.vector()、as.matrix()、as.data.frame()、as.factor()、as.logical() 等。
> # Build the sample data: one vector per variable, then combine them into the
> # `leadership` data frame. stringsAsFactors = FALSE keeps date, country and
> # gender as character columns. q4 and q5 contain NA for manager 4.
> manager <- c(1,2,3,4,5)
> date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
> country <- c("US","US","UK","UK","UK")
> gender <- c("M","F","F","M","F")
> age <- c(32,42,25,39,99)
> q1 <- c(5,3,3,3,2)
> q2 <- c(4,5,5,3,2)
> q3 <- c(5,2,5,4,1)
> q4 <- c(5,5,5,NA,2)
> q5 <- c(5,5,2,NA,1)
> leadership <- data.frame(manager,date,country,gender,age,
+                          q1,q2,q3,q4,q5,stringsAsFactors = FALSE)
> leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  42  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  99  2  2  1  2  1
> # Test the survey columns (6 to 10, i.e. q1-q5) for missing values
> is.na(leadership[,6:10])
        q1    q2    q3    q4    q5
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE  TRUE  TRUE
[5,] FALSE FALSE FALSE FALSE FALSE
> # Drop every row that contains a missing value (row 4 is removed here)
> newdata <- na.omit(leadership)
> newdata
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  42  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
5       5   5/1/09      UK      F  99  2  2  1  2  1
> # Sorting the data frame with order().
> # Columns are referenced through the data frame itself (leadership$... or
> # with(leadership, ...)) instead of attach(): the global vectors age, gender,
> # ... mask an attached copy, so attach() would silently sort by the globals
> # rather than by the data frame's own columns.
> # Sort by ascending age
> newdata1 <- leadership[order(leadership$age),]
> newdata1
  manager     date country gender age q1 q2 q3 q4 q5
3       3  10/1/08      UK      F  25  3  5  5  5  2
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA
2       2 10/28/08      US      F  42  3  5  2  5  5
5       5   5/1/09      UK      F  99  2  2  1  2  1
> # Group by gender, ascending age within each gender
> newdata2 <- leadership[with(leadership, order(gender, age)),]
> newdata2
  manager     date country gender age q1 q2 q3 q4 q5
3       3  10/1/08      UK      F  25  3  5  5  5  2
2       2 10/28/08      US      F  42  3  5  2  5  5
5       5   5/1/09      UK      F  99  2  2  1  2  1
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA
> # Group by gender, descending age within each gender (negate the numeric key)
> newdata3 <- leadership[with(leadership, order(gender, -age)),]
> newdata3
  manager     date country gender age q1 q2 q3 q4 q5
5       5   5/1/09      UK      F  99  2  2  1  2  1
2       2 10/28/08      US      F  42  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
1       1 10/24/08      US      M  32  5  4  5  5  5
> # Selecting observations (rows).
> # By position: rows 1 to 3
> newdata4 <- leadership[1:3,]
> # By logical comparison: males older than 30.
> # The comma after the condition is required. Without it the logical vector
> # is used as a column index (recycled across the 10 columns) and every row
> # is returned with only some of the columns.
> newdata5 <- leadership[leadership$gender=="M" & leadership$age >30,]
> newdata5
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA
> # Same selection written with with(), so the column names resolve inside
> # the data frame instead of relying on attach() and same-named globals
> newdata6 <- leadership[with(leadership, gender=="M" & age > 30),]
> newdata6
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA
> # Selecting observations and variables with subset()
> 
> # Rows with age >= 35 or age < 24; keep only the columns q1 to q4
> newdata7 <- subset(leadership,age >= 35 | age<24,select = c(q1,q2,q3,q4))
> newdata7
  q1 q2 q3 q4
2  3  5  2  5
4  3  3  4 NA
5  2  2  1  2
> # Males older than 25; keep the contiguous range of columns gender through q4
> newdata8 <- subset(leadership,gender=="M" & age>25,select = gender:q4)
> newdata8
  gender age q1 q2 q3 q4
1      M  32  5  4  5  5
4      M  39  3  3  4 NA
> # Query the data frame directly with an SQL statement.
> # NOTE(review): sqldf() is not defined anywhere in this session; presumably
> # it comes from the sqldf package, which has to be installed and loaded
> # (library(sqldf)) before this line - confirm.
> newdf <- sqldf("select * from leadership where country='US'",row.names = TRUE)
> newdf
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  42  3  5  2  5  5
> # Working with date values
> 
> # Strings already in the default yyyy-mm-dd layout need no format argument
> mydates <- as.Date(c("2021-06-22","2022-02-13"))
> mydates
[1] "2021-06-22" "2022-02-13"
> # Strings in another layout (mm/dd/yyyy) are parsed with an explicit format
> 
> strDates <- c("01/05/2022","08/16/2021")
> dates <- as.Date(strDates,"%m/%d/%Y")
> dates
[1] "2022-01-05" "2021-08-16"
> # Convert the data frame's date column from character strings (mm/dd/yy,
> # two-digit year, hence lowercase %y) to Date values
> 
> myformat <- "%m/%d/%y"
> leadership$date <- as.Date(leadership$date,myformat)
> leadership$date
[1] "2008-10-24" "2008-10-28" "2008-10-01" "2008-10-12" "2009-05-01"
> # Creating new variables: three ways to add sumx and meanx to mydata
> mydata <- data.frame(x1=c(2,2,6,4),x2=c(3,4,2,8))
> # Method 1: reference the columns explicitly with $
> mydata$sumx <- mydata$x1+mydata$x2
> mydata$meanx <- (mydata$x1 + mydata$x2)/2
> # Method 2: attach() the data frame so x1 and x2 can be used by bare name.
> # The first print below still shows the values computed by method 1; the
> # results are written back with mydata$... because a plain assignment
> # would not modify the data frame.
> attach(mydata)
> mydata
  x1 x2 sumx meanx
1  2  3    5   2.5
2  2  4    6   3.0
3  6  2    8   4.0
4  4  8   12   6.0
> mydata$sumx <- x1+x2
> mydata$meanx <- (x1+x2)/2
> detach(mydata)
> mydata
  x1 x2 sumx meanx
1  2  3    5   2.5
2  2  4    6   3.0
3  6  2    8   4.0
4  4  8   12   6.0
> # Method 3: transform() evaluates the expressions inside the data frame and
> # returns a modified copy, which is assigned back to mydata
> mydata <- transform(mydata,sumx=x1+x2,meanx=(x1+x2)/2)
> mydata
  x1 x2 sumx meanx
1  2  3    5   2.5
2  2  4    6   3.0
3  6  2    8   4.0
4  4  8   12   6.0










