Haste makes waste

Uda-DataAnalysis-23--探索单一变量

Posted on By lijun

3. 伪Facebook用户数据

> getwd()
[1] "C:/Users/utane/Documents"
> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> getwd()
[1] "C:/Users/utane/OneDrive/udacity/23-R"

# 列出当前目录下的文件名
> list.files()
[1] "lesson3_student.nb.html" "lesson3_student.rmd"     "pseudo_facebook.tsv"   

# 导入数据后,显示有99003行数据,15个变量 
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')

# 该数据集中的变量名
> names(pf)
 [1] "userid"                "age"                   "dob_day"               "dob_year"             
 [5] "dob_month"             "gender"                "tenure"                "friend_count"         
 [9] "friendships_initiated" "likes"                 "likes_received"        "mobile_likes"         
[13] "mobile_likes_received" "www_likes"             "www_likes_received"   

4. 练习:用户生日直方图

更多关于直方图的参考 如何读懂直方图并在 R 中进行使用


# 安装并加载图形库
# Install ggplot2 only when it is not already available, so re-running this
# snippet does not download the package again; then attach it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Histogram of the day of the month users were born on: one bin per day and
# one x-axis tick for each value from 1 to 31.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)

显示图形如下:

image

8. 练习:分面

通过下面的代码,可以实现通过月份分面:

# Birthday histogram again, drawn as one panel per birth month with the
# panels laid out in three columns.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  facet_wrap(~dob_month, ncol = 3) +
  scale_x_continuous(breaks = 1:31) +
  geom_histogram(binwidth = 1)

image

facet_wrap 和 facet_grid 都可以用于分面(切面):

image

关于Facets更多介绍,可以参考Facets (ggplot2)

11. 练习:好友数量

> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
> names(pf)
 [1] "userid"                "age"                   "dob_day"               "dob_year"             
 [5] "dob_month"             "gender"                "tenure"                "friend_count"         
 [9] "friendships_initiated" "likes"                 "likes_received"        "mobile_likes"         
[13] "mobile_likes_received" "www_likes"             "www_likes_received"   
# Histogram of friend counts using a bin width of a single friend.
> ggplot(data = pf, mapping = aes(x = friend_count)) + 
+   geom_histogram(binwidth = 1)

image

这种数据叫做长尾数据(long-tailed data),即少数观测值非常大,把X轴拉得很长。我们要研究的是好友数在1000以下的大部分用户,所以要对X轴的范围进行限制。

12. 限制轴

上面出现了一个5000的值,导致1000以下的图形看不清楚,需要对X轴的值进行限制:

# Friend-count histogram with the x scale restricted to the 0-1000 range.
# No bin width is given, so ggplot2 falls back to its default binning.
> ggplot(data = pf, mapping = aes(x = friend_count)) + 
+   scale_x_continuous(limits = c(0, 1000)) + 
+   geom_histogram()

image

14. 调整组距

上面的图形以125为组距,不是很容易看清差异,通过调整组距能更好反映数据变化。


# 添加组距,即0到1000,以50为步长
# Set an explicit bin width of 25 friends (instead of relying on the default
# binning) and put an x-axis tick every 50 friends between 0 and 1000.
> ggplot(aes(x = friend_count), data = pf) + 
+    geom_histogram(binwidth = 25) + 
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

# 以性别gender进行分面
# Same histogram split into one panel per gender value, with an explicit bin
# width of 25 friends rather than the default binning.
> ggplot(aes(x = friend_count), data = pf) + 
+    geom_histogram(binwidth = 25) + 
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) + 
+    facet_wrap(~gender)

图形分别如下所示:

image

image

15. 忽略NA观测值

观察上面gender进行分面后,最后一组是无效值组,需要将该组过滤掉。 即将data变量进行过滤,如下:

# Drop rows whose gender is NA before plotting so the facets contain only the
# real gender groups; the bin width is set explicitly to 25 friends.
> ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) + 
+    geom_histogram(binwidth = 25) + 
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) + 
+    facet_wrap(~gender)

image

16. 按性别划分的统计学(by())

通过by函数获得统计值,by函数接收三个参数

  • 变量
  • 类别变量,用于划分子集的指标列表
  • 函数
> table(pf$gender)
female   male 
 40254  58574 

> by(pf$friend_count,pf$gender,summary)
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      37      96     242     244    4923 
----------------------------------------------------------------------------------- 
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      27      74     165     182    4917 
> 

17. 使用时长

用户使用时长的直方图:第一个图的单位是天,第二个图的单位是年。注意两者binwidth的区别——binwidth以X轴的单位为基准(30天 与 0.25年)。


# Tenure measured in days, grouped into 30-day bins.
> ggplot(data = pf, mapping = aes(x = tenure)) + 
+   geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')

# Tenure converted to years, grouped into quarter-year bins.
> ggplot(data = pf, mapping = aes(x = tenure/365)) + 
+   geom_histogram(binwidth = 0.25, color = 'black', fill = '#F79420')

image

image

还可以给X轴限定范围和步长:

# Tenure in years with the x scale limited to 0-7 and a tick at each whole
# year from 1 to 7.
> ggplot(data = pf, mapping = aes(x = tenure/365)) + 
+   geom_histogram(binwidth = 0.25, color = 'black', fill = '#F79420') + 
+   scale_x_continuous(limits = c(0, 7), breaks = seq(1, 7, by = 1))

image

19. 练习:用户年龄

组距:binwidth / 刻度(间断):breaks / 标签:labels

# Age histogram: bins one year wide, x scale limited to 0-150 with a tick
# every 5 years starting at 1.
> ggplot(data = pf, mapping = aes(x = age)) + 
+   geom_histogram(binwidth = 1, color = 'black', fill = '#F79420') + 
+   scale_x_continuous(limits = c(0, 150), breaks = seq(1, 150, by = 5))

image

注意上面binwidth为1,即每个bar的宽度为1岁;而breaks是X轴上刻度的位置:从1到150岁,每5岁一个刻度。

22. 转换数据

> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
> names(pf)
 [1] "userid"                "age"                   "dob_day"              
 [4] "dob_year"              "dob_month"             "gender"               
 [7] "tenure"                "friend_count"          "friendships_initiated"
[10] "likes"                 "likes_received"        "mobile_likes"         
[13] "mobile_likes_received" "www_likes"             "www_likes_received"   


> 
> library(ggplot2)
> qplot(x=friend_count,data=pf)
> 

将以下三个直方图排成 1 列:好友数、使用 log10 转换的好友数、使用 sqrt 转换的好友数。在一个图中创建这三个直方图之前,需要先运行以下代码安装并加载 gridExtra:

# Install gridExtra only when it is not already available, so re-running this
# snippet does not download the package again; then attach it.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)
> summary(pf$friend_count)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    31.0    82.0   196.4   206.0  4923.0 
> summary(log10(pf$friend_count))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   -Inf   1.491   1.914    -Inf   2.314   3.692 
> summary(log10(pf$friend_count+1))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.505   1.919   1.868   2.316   3.692 
> summary(sqrt(pf$friend_count))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   5.568   9.055  11.088  14.353  70.164 
> library(gridExtra) 
> 
# Build the three histograms. Columns are referenced by bare name because
# qplot() looks them up in `data`; `<-` is used for assignment.
> q1 <- qplot(x = friend_count, data = pf)
# 1 is added before log10 so that users with 0 friends map to 0, not -Inf.
> q2 <- qplot(x = log10(friend_count + 1), data = pf)
> q3 <- qplot(x = sqrt(friend_count), data = pf)
> 
# Stack the three plots in a single column.
> grid.arrange(q1, q2, q3, ncol = 1)
  • 另一种方式也可以画出相同的图形:
# Alternative: build the raw friend-count histogram once, then derive the
# other two plots by adding a log10 or square-root x scale to it.
> p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) + 
+   geom_histogram()
> p2 <- p1 + scale_x_log10()
> p3 <- p1 + scale_x_sqrt()

# Stack the three plots in a single column.
> grid.arrange(p1, p2, p3, ncol = 1)

图形如下:

image

23. 添加定标层

# Attach the plotting packages before their functions are first called.
> library(ggplot2)
> library(gridExtra) 

# logScale transforms the variable itself with log10().
> logScale <- qplot(x = log10(friend_count), data = pf)
# countScale keeps the variable as-is and adds a log10 x scale layer instead.
> countScale <- ggplot(aes(x = friend_count), data = pf) + 
+   geom_histogram() + 
+   scale_x_log10()

# Show both plots side by side for comparison.
> grid.arrange(logScale, countScale, ncol = 2)

image

24. 频率多边形

# Friend-count histogram (bin width 10) for users with a known gender, one
# panel per gender. `limits` is spelled out in full rather than relying on
# partial argument matching of `lim`.
> qplot(x = friend_count, data = subset(pf, !is.na(gender)),
+   binwidth = 10) +
+   scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
+   facet_wrap(~gender)

image

# Frequency polygons of friend count (bin width 10), one coloured line per
# gender, for users with a known gender. `limits` is spelled out in full
# rather than relying on partial argument matching of `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)),
      binwidth = 10, geom = "freqpoly", color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

image

  • 使用频数多边形确定哪个性别在万维网 (www_likes) 上获得的点赞数量更多
# Frequency polygons of www_likes, one coloured line per gender, drawn on a
# log10 x scale. Only one x scale is added: a preceding scale_x_continuous()
# would simply be replaced by scale_x_log10() and trigger a
# "Scale for x is already present" message.
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      geom = "freqpoly", color = gender) +
  scale_x_log10()

image

25. 练习:网页端上的“点赞”数

by(pf$www_likes,pf$gender,sum)

pf$gender: female
[1] 3507665
-----------------------------------------------------------------------------------
pf$gender: male
[1] 1430175

by(pf$www_likes,pf$gender,summary)

pf$gender: female
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.00     0.00     0.00    87.14    25.00 14865.00 
-----------------------------------------------------------------------------------
pf$gender: male
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.00     0.00     0.00    24.42     2.00 12903.00 

26. 箱线图

  • 方法1
# Box plot of friend count by gender for users with a known gender. The y
# scale is limited to 0-1000 with a tick every 50; `limits` is spelled out in
# full rather than relying on partial argument matching of `lim`.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  scale_y_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

image

  • 方法2
# Same box plot, with the 0-1000 y range passed directly to qplot() through
# its `ylim` argument instead of a separate scale layer.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot",
  ylim = c(0, 1000)
)

image

27. 练习:箱线图、四分位数和友谊

# Box plot of friend count by gender, with the visible y range set to 0-250
# through the coordinate system rather than the scale.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))

# Numeric summary of friend count for each gender, to compare with the plot.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

image

pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      37      96     242     244    4923 
-----------------------------------------------------------------------------------
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      27      74     165     182    4917 

  • 问题:谁发起了更多的交友请求,男士还是女士?
# Box plot of friendships initiated by gender, with the visible y range set
# to 0-150 through the coordinate system.
qplot(
  x = gender,
  y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 150))

# Numeric summary of friendships initiated for each gender.
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = summary)

image

pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    19.0    49.0   113.9   124.8  3654.0 
-----------------------------------------------------------------------------------
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    15.0    44.0   103.1   111.0  4144.0 

28. 练习:符合逻辑

summary(pf$mobile_likes)

# Flag users who have at least one mobile like: 1 when mobile_likes > 0,
# otherwise 0, stored as a factor column on `pf`. The column is assigned
# directly; no separate placeholder variable is created first.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0     0.0     4.0   106.1    46.0 25111.0 

    0     1 
35056 63947 

由于 mobile_check_in 是一个因子变量,因此 sum() 函数将无法运行。你可以使用 length() 函数来确定向量中的值数量。 可以创建 mobile_check_in 来保存布尔值。sum() 函数可处理布尔值(true 为 1,false 为 0)。

  • 求用手机登录人数的百分比
# Share of users flagged as mobile users: the mean of a logical vector is the
# fraction of TRUE values (a proportion between 0 and 1).
mean(pf$mobile_check_in == 1)