美文网首页
Week1: swirl教程 1: Manipulating Data with dplyr

Week1: swirl教程 1: Manipulating Data with dplyr

作者: Chamberzero | 来源:发表于2021-10-15 01:42 被阅读0次

读取数据并查看数据集基本情况

mydf <- read.csv(file = path2csv,stringsAsFactors = FALSE) # 读取数据

dim(mydf) #查看维度
[1] 225468     11

head(mydf) #查看数据库头
 X       date     time   size r_version r_arch
1 1 2014-07-08 00:54:41  80589     3.1.0 x86_64
2 2 2014-07-08 00:59:53 321767     3.1.0 x86_64
3 3 2014-07-08 00:47:13 748063     3.1.0 x86_64
4 4 2014-07-08 00:48:05 606104     3.1.0 x86_64
5 5 2014-07-08 00:46:50  79825     3.0.2 x86_64
6 6 2014-07-08 00:48:04  77681     3.1.0 x86_64
       r_os      package version country ip_id
1   mingw32    htmltools   0.2.4      US     1
2   mingw32      tseries 0.10-32      US     2
3 linux-gnu        party  1.0-15      US     3
4 linux-gnu        Hmisc  3.14-4      US     3
5 linux-gnu       digest   0.6.4      CA     4
6 linux-gnu randomForest   4.6-7      US     3

载入dplyr及查看信息

library(dplyr)
packageVersion("dplyr")
[1] ‘1.0.7’

转换表格（注：tbl_df() 自 dplyr 1.0.0 起已弃用，新代码建议改用 as_tibble(mydf)）

 cran <- tbl_df(mydf)
cran
# A tibble: 225,468 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1     1 2014… 00:5… 8.06e4 3.1.0     x86_64 ming… htmlto…
 2     2 2014… 00:5… 3.22e5 3.1.0     x86_64 ming… tseries
 3     3 2014… 00:4… 7.48e5 3.1.0     x86_64 linu… party  
 4     4 2014… 00:4… 6.06e5 3.1.0     x86_64 linu… Hmisc  
 5     5 2014… 00:4… 7.98e4 3.0.2     x86_64 linu… digest 
 6     6 2014… 00:4… 7.77e4 3.1.0     x86_64 linu… random…
 7     7 2014… 00:4… 3.94e5 3.1.0     x86_64 linu… plyr   
 8     8 2014… 00:4… 2.82e4 3.0.2     x86_64 linu… whisker
 9     9 2014… 00:5… 5.93e3 NA        NA     NA    Rcpp   
10    10 2014… 00:1… 2.21e6 3.0.2     x86_64 linu… hfligh…

select()

select(cran,ip_id,package,country)
# A tibble: 225,468 × 3
   ip_id package      country
   <int> <chr>        <chr>  
 1     1 htmltools    US     
 2     2 tseries      US     
 3     3 party        US     
 4     3 Hmisc        US     
 5     4 digest       CA     
 6     3 randomForest US     
 7     3 plyr         US     
 8     5 whisker      US     
 9     6 Rcpp         CN     
10     7 hflights     US     
# … with 225,458 more rows

select(cran, r_arch:country)
# A tibble: 225,468 × 5
   r_arch r_os      package      version country
   <chr>  <chr>     <chr>        <chr>   <chr>  
 1 x86_64 mingw32   htmltools    0.2.4   US     
 2 x86_64 mingw32   tseries      0.10-32 US     
 3 x86_64 linux-gnu party        1.0-15  US     
 4 x86_64 linux-gnu Hmisc        3.14-4  US     
 5 x86_64 linux-gnu digest       0.6.4   CA     
 6 x86_64 linux-gnu randomForest 4.6-7   US     
 7 x86_64 linux-gnu plyr         1.8.1   US     
 8 x86_64 linux-gnu whisker      0.3-2   US     
 9 NA     NA        Rcpp         0.10.4  CN     
10 x86_64 linux-gnu hflights     0.1     US     
# … with 225,458 more rows

select(cran, -time) 
# A tibble: 225,468 × 10
       X date          size r_version r_arch r_os  package
   <int> <chr>        <int> <chr>     <chr>  <chr> <chr>  
 1     1 2014-07-08   80589 3.1.0     x86_64 ming… htmlto…
 2     2 2014-07-08  321767 3.1.0     x86_64 ming… tseries
 3     3 2014-07-08  748063 3.1.0     x86_64 linu… party  
 4     4 2014-07-08  606104 3.1.0     x86_64 linu… Hmisc  
 5     5 2014-07-08   79825 3.0.2     x86_64 linu… digest 
 6     6 2014-07-08   77681 3.1.0     x86_64 linu… random…
 7     7 2014-07-08  393754 3.1.0     x86_64 linu… plyr   
 8     8 2014-07-08   28216 3.0.2     x86_64 linu… whisker
 9     9 2014-07-08    5928 NA        NA     NA    Rcpp   
10    10 2014-07-08 2206029 3.0.2     x86_64 linu… hfligh…
# … with 225,458 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

对序列取负（注意必须加括号：-(5:20) 是对 5 到 20 整体取负；-5:20 则表示从 -5 到 20 的序列）

-(5:20)
 [1]  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17
[14] -18 -19 -20

filter()

filter(cran, package == "swirl") 

# A tibble: 820 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1    27 2014… 00:1… 105350 3.0.2     x86_64 ming… swirl  
 2   156 2014… 00:2…  41261 3.1.0     x86_64 linu… swirl  
 3   358 2014… 00:1… 105335 2.15.2    x86_64 ming… swirl  
 4   593 2014… 00:5… 105465 3.1.0     x86_64 darw… swirl  
 5   831 2014… 00:5… 105335 3.0.3     x86_64 ming… swirl  
 6   997 2014… 00:3…  41261 3.1.0     x86_64 ming… swirl  
 7  1023 2014… 00:3… 106393 3.1.0     x86_64 ming… swirl  
 8  1144 2014… 00:0… 106534 3.0.2     x86_64 linu… swirl  
 9  1402 2014… 00:4…  41261 3.1.0     i386   ming… swirl  
10  1424 2014… 00:4… 106393 3.1.0     x86_64 linu… swirl  
# … with 810 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

filter(cran, r_version == "3.1.1", country == "US")

# A tibble: 1,588 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1  2216 2014… 00:4… 3.85e5 3.1.1     x86_64 darw… colors…
 2 17332 2014… 03:3… 1.97e5 3.1.1     x86_64 darw… httr   
 3 17465 2014… 03:2… 2.33e4 3.1.1     x86_64 darw… snow   
 4 18844 2014… 03:5… 1.91e5 3.1.1     x86_64 darw… maxLik 
 5 30182 2014… 04:1… 7.77e4 3.1.1     i386   ming… random…
 6 30193 2014… 04:0… 2.35e6 3.1.1     i386   ming… ggplot2
 7 30195 2014… 04:0… 2.99e5 3.1.1     i386   ming… fExtre…
 8 30217 2014… 04:3… 5.68e5 3.1.1     i386   ming… rJava  
 9 30245 2014… 04:1… 5.27e5 3.1.1     i386   ming… LPCM   
10 30354 2014… 04:3… 1.76e6 3.1.1     i386   ming… mgcv   
# … with 1,578 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

filter(cran, r_version <= "3.0.2", country == "IN")

# A tibble: 4,139 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1   348 2014… 00:4… 1.02e7 3.0.0     x86_64 ming… BH     
 2  9990 2014… 02:1… 3.97e5 3.0.2     x86_64 linu… equate…
 3  9991 2014… 02:1… 1.19e5 3.0.2     x86_64 linu… ggdend…
 4  9992 2014… 02:1… 8.18e4 3.0.2     x86_64 linu… dfcrm  
 5 10022 2014… 02:1… 1.56e6 2.15.0    x86_64 ming… RcppAr…
 6 10023 2014… 02:1… 1.18e6 2.15.1    i686   linu… foreca…
 7 10189 2014… 02:3… 9.09e5 3.0.2     x86_64 linu… editru…
 8 10199 2014… 02:3… 1.78e5 3.0.2     x86_64 linu… energy 
 9 10200 2014… 02:3… 5.18e4 3.0.2     x86_64 linu… ENmisc 
10 10201 2014… 02:3… 6.52e4 3.0.2     x86_64 linu… entropy
# … with 4,129 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

filter(cran,size > 100500, r_os == "linux-gnu")
# A tibble: 33,683 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1     3 2014… 00:4… 7.48e5 3.1.0     x86_64 linu… party  
 2     4 2014… 00:4… 6.06e5 3.1.0     x86_64 linu… Hmisc  
 3     7 2014… 00:4… 3.94e5 3.1.0     x86_64 linu… plyr   
 4    10 2014… 00:1… 2.21e6 3.0.2     x86_64 linu… hfligh…
 5    11 2014… 00:1… 5.27e5 3.0.2     x86_64 linu… LPCM   
 6    12 2014… 00:1… 2.35e6 2.14.1    x86_64 linu… ggplot2
 7    14 2014… 00:1… 3.10e6 3.0.2     x86_64 linu… Rcpp   
 8    15 2014… 00:1… 5.68e5 3.1.0     x86_64 linu… rJava  
 9    16 2014… 00:1… 1.60e6 3.1.0     x86_64 linu… RSQLite
10    18 2014… 00:2… 1.87e5 3.1.0     x86_64 linu… ipred  
# … with 33,673 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

filter(cran,!is.na(r_version))
# A tibble: 207,205 × 11
       X date  time    size r_version r_arch r_os  package
   <int> <chr> <chr>  <int> <chr>     <chr>  <chr> <chr>  
 1     1 2014… 00:5… 8.06e4 3.1.0     x86_64 ming… htmlto…
 2     2 2014… 00:5… 3.22e5 3.1.0     x86_64 ming… tseries
 3     3 2014… 00:4… 7.48e5 3.1.0     x86_64 linu… party  
 4     4 2014… 00:4… 6.06e5 3.1.0     x86_64 linu… Hmisc  
 5     5 2014… 00:4… 7.98e4 3.0.2     x86_64 linu… digest 
 6     6 2014… 00:4… 7.77e4 3.1.0     x86_64 linu… random…
 7     7 2014… 00:4… 3.94e5 3.1.0     x86_64 linu… plyr   
 8     8 2014… 00:4… 2.82e4 3.0.2     x86_64 linu… whisker
 9    10 2014… 00:1… 2.21e6 3.0.2     x86_64 linu… hfligh…
10    11 2014… 00:1… 5.27e5 3.0.2     x86_64 linu… LPCM   
# … with 207,195 more rows, and 3 more variables:
#   version <chr>, country <chr>, ip_id <int>

arrange()

cran2 <- select(cran,size:ip_id)

cran2 的前 10 行（排序前，仍保持原始顺序）：

# A tibble: 225,468 × 8
      size r_version r_arch r_os      package      version country ip_id
     <int> <chr>     <chr>  <chr>     <chr>        <chr>   <chr>   <int>
 1   80589 3.1.0     x86_64 mingw32   htmltools    0.2.4   US          1
 2  321767 3.1.0     x86_64 mingw32   tseries      0.10-32 US          2
 3  748063 3.1.0     x86_64 linux-gnu party        1.0-15  US          3
 4  606104 3.1.0     x86_64 linux-gnu Hmisc        3.14-4  US          3
 5   79825 3.0.2     x86_64 linux-gnu digest       0.6.4   CA          4
 6   77681 3.1.0     x86_64 linux-gnu randomForest 4.6-7   US          3
 7  393754 3.1.0     x86_64 linux-gnu plyr         1.8.1   US          3
 8   28216 3.0.2     x86_64 linux-gnu whisker      0.3-2   US          5
 9    5928 NA        NA     NA        Rcpp         0.10.4  CN          6
10 2206029 3.0.2     x86_64 linux-gnu hflights     0.1     US          7
# … with 225,458 more rows

arrange(cran2,ip_id)

# A tibble: 225,468 × 8
     size r_version r_arch r_os         package     version country ip_id
    <int> <chr>     <chr>  <chr>        <chr>       <chr>   <chr>   <int>
 1  80589 3.1.0     x86_64 mingw32      htmltools   0.2.4   US          1
 2 180562 3.0.2     x86_64 mingw32      yaml        2.1.13  US          1
 3 190120 3.1.0     i386   mingw32      babel       0.2-6   US          1
 4 321767 3.1.0     x86_64 mingw32      tseries     0.10-32 US          2
 5  52281 3.0.3     x86_64 darwin10.8.0 quadprog    1.5-5   US          2
 6 876702 3.1.0     x86_64 linux-gnu    zoo         1.7-11  US          2
 7 321764 3.0.2     x86_64 linux-gnu    tseries     0.10-32 US          2
 8 876702 3.1.0     x86_64 linux-gnu    zoo         1.7-11  US          2
 9 321768 3.1.0     x86_64 mingw32      tseries     0.10-32 US          2
10 784093 3.1.0     x86_64 linux-gnu    strucchange 1.5-0   US          2
# … with 225,458 more rows

arrange(cran2, desc(ip_id))

# A tibble: 225,468 × 8
      size r_version r_arch r_os         package      version country ip_id
     <int> <chr>     <chr>  <chr>        <chr>        <chr>   <chr>   <int>
 1    5933 NA        NA     NA           CPE          1.4.2   CN      13859
 2  569241 3.1.0     x86_64 mingw32      multcompView 0.1-5   US      13858
 3  228444 3.1.0     x86_64 mingw32      tourr        0.5.3   NZ      13857
 4  308962 3.1.0     x86_64 darwin13.1.0 ctv          0.7-9   CN      13856
 5  950964 3.0.3     i386   mingw32      knitr        1.6     CA      13855
 6   80185 3.0.3     i386   mingw32      htmltools    0.2.4   CA      13855
 7 1431750 3.0.3     i386   mingw32      shiny        0.10.0  CA      13855
 8 2189695 3.1.0     x86_64 mingw32      RMySQL       0.9-3   US      13854
 9 4818024 3.1.0     i386   mingw32      igraph       0.7.1   US      13853
10  197495 3.1.0     x86_64 mingw32      coda         0.16-1  US      13852
# … with 225,458 more rows

mutate()

先从 cran 中选出后面要用的三列：cran3 <- select(cran, ip_id, package, size)

mutate(cran3, size_mb = size / 2^20)

# A tibble: 225,468 × 4
   ip_id package         size size_mb
   <int> <chr>          <int>   <dbl>
 1     1 htmltools      80589 0.0769 
 2     2 tseries       321767 0.307  
 3     3 party         748063 0.713  
 4     3 Hmisc         606104 0.578  
 5     4 digest         79825 0.0761 
 6     3 randomForest   77681 0.0741 
 7     3 plyr          393754 0.376  
 8     5 whisker        28216 0.0269 
 9     6 Rcpp            5928 0.00565
10     7 hflights     2206029 2.10   
# … with 225,458 more rows

mutate(cran3,size_mb = size / 2^20,size_gb = size_mb / 2^10)

# A tibble: 225,468 × 5
   ip_id package         size size_mb    size_gb
   <int> <chr>          <int>   <dbl>      <dbl>
 1     1 htmltools      80589 0.0769  0.0000751 
 2     2 tseries       321767 0.307   0.000300  
 3     3 party         748063 0.713   0.000697  
 4     3 Hmisc         606104 0.578   0.000564  
 5     4 digest         79825 0.0761  0.0000743 
 6     3 randomForest   77681 0.0741  0.0000723 
 7     3 plyr          393754 0.376   0.000367  
 8     5 whisker        28216 0.0269  0.0000263 
 9     6 Rcpp            5928 0.00565 0.00000552
10     7 hflights     2206029 2.10    0.00205   
# … with 225,458 more rows

mutate(cran3, correct_size  = size + 1000)
# A tibble: 225,468 × 4
   ip_id package         size correct_size
   <int> <chr>          <int>        <dbl>
 1     1 htmltools      80589        81589
 2     2 tseries       321767       322767
 3     3 party         748063       749063
 4     3 Hmisc         606104       607104
 5     4 digest         79825        80825
 6     3 randomForest   77681        78681
 7     3 plyr          393754       394754
 8     5 whisker        28216        29216
 9     6 Rcpp            5928         6928
10     7 hflights     2206029      2207029
# … with 225,458 more rows

summarize()

summarize(cran, avg_bytes = mean(size))
# A tibble: 1 × 1
  avg_bytes
      <dbl>
1   844086.

相关文章

网友评论

      本文标题:Week1: swirl教程 1: Manipulating Data with dplyr

      本文链接:https://www.haomeiwen.com/subject/gscpoltx.html