读取数据并查看数据集的基本情况
mydf <- read.csv(file = path2csv,stringsAsFactors = FALSE) # 读取数据
dim(mydf) #查看维度
[1] 225468 11
head(mydf) # 查看数据框的前 6 行
X date time size r_version r_arch
1 1 2014-07-08 00:54:41 80589 3.1.0 x86_64
2 2 2014-07-08 00:59:53 321767 3.1.0 x86_64
3 3 2014-07-08 00:47:13 748063 3.1.0 x86_64
4 4 2014-07-08 00:48:05 606104 3.1.0 x86_64
5 5 2014-07-08 00:46:50 79825 3.0.2 x86_64
6 6 2014-07-08 00:48:04 77681 3.1.0 x86_64
r_os package version country ip_id
1 mingw32 htmltools 0.2.4 US 1
2 mingw32 tseries 0.10-32 US 2
3 linux-gnu party 1.0-15 US 3
4 linux-gnu Hmisc 3.14-4 US 3
5 linux-gnu digest 0.6.4 CA 4
6 linux-gnu randomForest 4.6-7 US 3
载入dplyr及查看信息
library(dplyr)
packageVersion("dplyr")
[1] ‘1.0.7’
转换为 tibble
cran <- as_tibble(mydf) # tbl_df() 自 dplyr 1.0.0 起已弃用，改用 as_tibble()
cran
# A tibble: 225,468 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 1 2014… 00:5… 8.06e4 3.1.0 x86_64 ming… htmlto…
2 2 2014… 00:5… 3.22e5 3.1.0 x86_64 ming… tseries
3 3 2014… 00:4… 7.48e5 3.1.0 x86_64 linu… party
4 4 2014… 00:4… 6.06e5 3.1.0 x86_64 linu… Hmisc
5 5 2014… 00:4… 7.98e4 3.0.2 x86_64 linu… digest
6 6 2014… 00:4… 7.77e4 3.1.0 x86_64 linu… random…
7 7 2014… 00:4… 3.94e5 3.1.0 x86_64 linu… plyr
8 8 2014… 00:4… 2.82e4 3.0.2 x86_64 linu… whisker
9 9 2014… 00:5… 5.93e3 NA NA NA Rcpp
10 10 2014… 00:1… 2.21e6 3.0.2 x86_64 linu… hfligh…
select()
select(cran,ip_id,package,country)
# A tibble: 225,468 × 3
ip_id package country
<int> <chr> <chr>
1 1 htmltools US
2 2 tseries US
3 3 party US
4 3 Hmisc US
5 4 digest CA
6 3 randomForest US
7 3 plyr US
8 5 whisker US
9 6 Rcpp CN
10 7 hflights US
# … with 225,458 more rows
select(cran, r_arch:country)
# A tibble: 225,468 × 5
r_arch r_os package version country
<chr> <chr> <chr> <chr> <chr>
1 x86_64 mingw32 htmltools 0.2.4 US
2 x86_64 mingw32 tseries 0.10-32 US
3 x86_64 linux-gnu party 1.0-15 US
4 x86_64 linux-gnu Hmisc 3.14-4 US
5 x86_64 linux-gnu digest 0.6.4 CA
6 x86_64 linux-gnu randomForest 4.6-7 US
7 x86_64 linux-gnu plyr 1.8.1 US
8 x86_64 linux-gnu whisker 0.3-2 US
9 NA NA Rcpp 0.10.4 CN
10 x86_64 linux-gnu hflights 0.1 US
# … with 225,458 more rows
select(cran, -time)
# A tibble: 225,468 × 10
X date size r_version r_arch r_os package
<int> <chr> <int> <chr> <chr> <chr> <chr>
1 1 2014-07-08 80589 3.1.0 x86_64 ming… htmlto…
2 2 2014-07-08 321767 3.1.0 x86_64 ming… tseries
3 3 2014-07-08 748063 3.1.0 x86_64 linu… party
4 4 2014-07-08 606104 3.1.0 x86_64 linu… Hmisc
5 5 2014-07-08 79825 3.0.2 x86_64 linu… digest
6 6 2014-07-08 77681 3.1.0 x86_64 linu… random…
7 7 2014-07-08 393754 3.1.0 x86_64 linu… plyr
8 8 2014-07-08 28216 3.0.2 x86_64 linu… whisker
9 9 2014-07-08 5928 NA NA NA Rcpp
10 10 2014-07-08 2206029 3.0.2 x86_64 linu… hfligh…
# … with 225,458 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
对序列取负：-(5:20) 会逐元素取负，这也是 select() 中用负号排除列的原理
-(5:20)
[1] -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17
[14] -18 -19 -20
filter()
filter(cran, package == "swirl")
# A tibble: 820 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 27 2014… 00:1… 105350 3.0.2 x86_64 ming… swirl
2 156 2014… 00:2… 41261 3.1.0 x86_64 linu… swirl
3 358 2014… 00:1… 105335 2.15.2 x86_64 ming… swirl
4 593 2014… 00:5… 105465 3.1.0 x86_64 darw… swirl
5 831 2014… 00:5… 105335 3.0.3 x86_64 ming… swirl
6 997 2014… 00:3… 41261 3.1.0 x86_64 ming… swirl
7 1023 2014… 00:3… 106393 3.1.0 x86_64 ming… swirl
8 1144 2014… 00:0… 106534 3.0.2 x86_64 linu… swirl
9 1402 2014… 00:4… 41261 3.1.0 i386 ming… swirl
10 1424 2014… 00:4… 106393 3.1.0 x86_64 linu… swirl
# … with 810 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
filter(cran, r_version == "3.1.1", country == "US")
# A tibble: 1,588 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 2216 2014… 00:4… 3.85e5 3.1.1 x86_64 darw… colors…
2 17332 2014… 03:3… 1.97e5 3.1.1 x86_64 darw… httr
3 17465 2014… 03:2… 2.33e4 3.1.1 x86_64 darw… snow
4 18844 2014… 03:5… 1.91e5 3.1.1 x86_64 darw… maxLik
5 30182 2014… 04:1… 7.77e4 3.1.1 i386 ming… random…
6 30193 2014… 04:0… 2.35e6 3.1.1 i386 ming… ggplot2
7 30195 2014… 04:0… 2.99e5 3.1.1 i386 ming… fExtre…
8 30217 2014… 04:3… 5.68e5 3.1.1 i386 ming… rJava
9 30245 2014… 04:1… 5.27e5 3.1.1 i386 ming… LPCM
10 30354 2014… 04:3… 1.76e6 3.1.1 i386 ming… mgcv
# … with 1,578 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
filter(cran, r_version <= "3.0.2", country == "IN")
# A tibble: 4,139 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 348 2014… 00:4… 1.02e7 3.0.0 x86_64 ming… BH
2 9990 2014… 02:1… 3.97e5 3.0.2 x86_64 linu… equate…
3 9991 2014… 02:1… 1.19e5 3.0.2 x86_64 linu… ggdend…
4 9992 2014… 02:1… 8.18e4 3.0.2 x86_64 linu… dfcrm
5 10022 2014… 02:1… 1.56e6 2.15.0 x86_64 ming… RcppAr…
6 10023 2014… 02:1… 1.18e6 2.15.1 i686 linu… foreca…
7 10189 2014… 02:3… 9.09e5 3.0.2 x86_64 linu… editru…
8 10199 2014… 02:3… 1.78e5 3.0.2 x86_64 linu… energy
9 10200 2014… 02:3… 5.18e4 3.0.2 x86_64 linu… ENmisc
10 10201 2014… 02:3… 6.52e4 3.0.2 x86_64 linu… entropy
# … with 4,129 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
filter(cran,size > 100500, r_os == "linux-gnu")
# A tibble: 33,683 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 3 2014… 00:4… 7.48e5 3.1.0 x86_64 linu… party
2 4 2014… 00:4… 6.06e5 3.1.0 x86_64 linu… Hmisc
3 7 2014… 00:4… 3.94e5 3.1.0 x86_64 linu… plyr
4 10 2014… 00:1… 2.21e6 3.0.2 x86_64 linu… hfligh…
5 11 2014… 00:1… 5.27e5 3.0.2 x86_64 linu… LPCM
6 12 2014… 00:1… 2.35e6 2.14.1 x86_64 linu… ggplot2
7 14 2014… 00:1… 3.10e6 3.0.2 x86_64 linu… Rcpp
8 15 2014… 00:1… 5.68e5 3.1.0 x86_64 linu… rJava
9 16 2014… 00:1… 1.60e6 3.1.0 x86_64 linu… RSQLite
10 18 2014… 00:2… 1.87e5 3.1.0 x86_64 linu… ipred
# … with 33,673 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
filter(cran,!is.na(r_version))
# A tibble: 207,205 × 11
X date time size r_version r_arch r_os package
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
1 1 2014… 00:5… 8.06e4 3.1.0 x86_64 ming… htmlto…
2 2 2014… 00:5… 3.22e5 3.1.0 x86_64 ming… tseries
3 3 2014… 00:4… 7.48e5 3.1.0 x86_64 linu… party
4 4 2014… 00:4… 6.06e5 3.1.0 x86_64 linu… Hmisc
5 5 2014… 00:4… 7.98e4 3.0.2 x86_64 linu… digest
6 6 2014… 00:4… 7.77e4 3.1.0 x86_64 linu… random…
7 7 2014… 00:4… 3.94e5 3.1.0 x86_64 linu… plyr
8 8 2014… 00:4… 2.82e4 3.0.2 x86_64 linu… whisker
9 10 2014… 00:1… 2.21e6 3.0.2 x86_64 linu… hfligh…
10 11 2014… 00:1… 5.27e5 3.0.2 x86_64 linu… LPCM
# … with 207,195 more rows, and 3 more variables:
# version <chr>, country <chr>, ip_id <int>
arrange()
cran2 <- select(cran,size:ip_id)
cran2
# A tibble: 225,468 × 8
size r_version r_arch r_os package version country ip_id
<int> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 80589 3.1.0 x86_64 mingw32 htmltools 0.2.4 US 1
2 321767 3.1.0 x86_64 mingw32 tseries 0.10-32 US 2
3 748063 3.1.0 x86_64 linux-gnu party 1.0-15 US 3
4 606104 3.1.0 x86_64 linux-gnu Hmisc 3.14-4 US 3
5 79825 3.0.2 x86_64 linux-gnu digest 0.6.4 CA 4
6 77681 3.1.0 x86_64 linux-gnu randomForest 4.6-7 US 3
7 393754 3.1.0 x86_64 linux-gnu plyr 1.8.1 US 3
8 28216 3.0.2 x86_64 linux-gnu whisker 0.3-2 US 5
9 5928 NA NA NA Rcpp 0.10.4 CN 6
10 2206029 3.0.2 x86_64 linux-gnu hflights 0.1 US 7
# … with 225,458 more rows
arrange(cran2,ip_id)
# A tibble: 225,468 × 8
size r_version r_arch r_os package version country ip_id
<int> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 80589 3.1.0 x86_64 mingw32 htmltools 0.2.4 US 1
2 180562 3.0.2 x86_64 mingw32 yaml 2.1.13 US 1
3 190120 3.1.0 i386 mingw32 babel 0.2-6 US 1
4 321767 3.1.0 x86_64 mingw32 tseries 0.10-32 US 2
5 52281 3.0.3 x86_64 darwin10.8.0 quadprog 1.5-5 US 2
6 876702 3.1.0 x86_64 linux-gnu zoo 1.7-11 US 2
7 321764 3.0.2 x86_64 linux-gnu tseries 0.10-32 US 2
8 876702 3.1.0 x86_64 linux-gnu zoo 1.7-11 US 2
9 321768 3.1.0 x86_64 mingw32 tseries 0.10-32 US 2
10 784093 3.1.0 x86_64 linux-gnu strucchange 1.5-0 US 2
# … with 225,458 more rows
arrange(cran2, desc(ip_id))
# A tibble: 225,468 × 8
size r_version r_arch r_os package version country ip_id
<int> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 5933 NA NA NA CPE 1.4.2 CN 13859
2 569241 3.1.0 x86_64 mingw32 multcompView 0.1-5 US 13858
3 228444 3.1.0 x86_64 mingw32 tourr 0.5.3 NZ 13857
4 308962 3.1.0 x86_64 darwin13.1.0 ctv 0.7-9 CN 13856
5 950964 3.0.3 i386 mingw32 knitr 1.6 CA 13855
6 80185 3.0.3 i386 mingw32 htmltools 0.2.4 CA 13855
7 1431750 3.0.3 i386 mingw32 shiny 0.10.0 CA 13855
8 2189695 3.1.0 x86_64 mingw32 RMySQL 0.9-3 US 13854
9 4818024 3.1.0 i386 mingw32 igraph 0.7.1 US 13853
10 197495 3.1.0 x86_64 mingw32 coda 0.16-1 US 13852
# … with 225,458 more rows
mutate()
cran3 <- select(cran, ip_id, package, size)
mutate(cran3, size_mb = size / 2^20)
# A tibble: 225,468 × 4
ip_id package size size_mb
<int> <chr> <int> <dbl>
1 1 htmltools 80589 0.0769
2 2 tseries 321767 0.307
3 3 party 748063 0.713
4 3 Hmisc 606104 0.578
5 4 digest 79825 0.0761
6 3 randomForest 77681 0.0741
7 3 plyr 393754 0.376
8 5 whisker 28216 0.0269
9 6 Rcpp 5928 0.00565
10 7 hflights 2206029 2.10
# … with 225,458 more rows
mutate(cran3,size_mb = size / 2^20,size_gb = size_mb / 2^10)
# A tibble: 225,468 × 5
ip_id package size size_mb size_gb
<int> <chr> <int> <dbl> <dbl>
1 1 htmltools 80589 0.0769 0.0000751
2 2 tseries 321767 0.307 0.000300
3 3 party 748063 0.713 0.000697
4 3 Hmisc 606104 0.578 0.000564
5 4 digest 79825 0.0761 0.0000743
6 3 randomForest 77681 0.0741 0.0000723
7 3 plyr 393754 0.376 0.000367
8 5 whisker 28216 0.0269 0.0000263
9 6 Rcpp 5928 0.00565 0.00000552
10 7 hflights 2206029 2.10 0.00205
# … with 225,458 more rows
mutate(cran3, correct_size = size + 1000)
# A tibble: 225,468 × 4
ip_id package size correct_size
<int> <chr> <int> <dbl>
1 1 htmltools 80589 81589
2 2 tseries 321767 322767
3 3 party 748063 749063
4 3 Hmisc 606104 607104
5 4 digest 79825 80825
6 3 randomForest 77681 78681
7 3 plyr 393754 394754
8 5 whisker 28216 29216
9 6 Rcpp 5928 6928
10 7 hflights 2206029 2207029
# … with 225,458 more rows
summarize()
summarize(cran, avg_bytes = mean(size))
# A tibble: 1 × 1
avg_bytes
<dbl>
1 844086.
网友评论