- 3. 伪Facebook用户数据
- 4. 练习:用户生日直方图
- 8. 练习:分面
- 11. 练习:好友数量
- 12. 限制轴
- 14. 调整组距
- 15. 忽略NA观测值
- 16. 按性别划分的统计学(by())
- 17. 使用时长
- 19. 练习:用户年龄
- 22. 转换数据
- 23. 添加定标层
- 24. 频率多边形
- 25. 练习:网页端上的“点赞”数
- 26. 箱线图
- 27. 练习:箱线图、四分位数和友谊
- 28. 练习:符合逻辑
3. 伪Facebook用户数据
> getwd()
[1] "C:/Users/utane/Documents"
> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> getwd()
[1] "C:/Users/utane/OneDrive/udacity/23-R"
# 列出当前目录下的文件名
> list.files()
[1] "lesson3_student.nb.html" "lesson3_student.rmd" "pseudo_facebook.tsv"
# 导入数据后,显示有99003行数据,15个变量
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
# 该数据集中的变量名
> names(pf)
[1] "userid" "age" "dob_day" "dob_year"
[5] "dob_month" "gender" "tenure" "friend_count"
[9] "friendships_initiated" "likes" "likes_received" "mobile_likes"
[13] "mobile_likes_received" "www_likes" "www_likes_received"
4. 练习:用户生日直方图
更多关于直方图的参考:《如何读懂直方图并在 R 中进行使用》
# 安装并加载图形库
install.packages('ggplot2')
library(ggplot2)
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31)
显示图形如下:
8. 练习:分面
通过下面的代码,可以实现通过月份分面:
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31) + facet_wrap(~dob_month,ncol = 3)
facet_wrap 和 facet_grid 都可用于分面。
关于Facets更多介绍,可以参考Facets (ggplot2)
11. 练习:好友数量
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
> names(pf)
[1] "userid" "age" "dob_day" "dob_year"
[5] "dob_month" "gender" "tenure" "friend_count"
[9] "friendships_initiated" "likes" "likes_received" "mobile_likes"
[13] "mobile_likes_received" "www_likes" "www_likes_received"
> ggplot(aes(x = friend_count), data = pf) +
+ geom_histogram(binwidth = 1)
这种数据叫做长尾数据(long tail data),即少数观测值非常大,在图上拖出一条很长的尾巴。我们要研究的是好友数在1000以下的大多数用户,所以需要对X轴的范围进行限制。
12. 限制轴
由于数据中存在接近5000的极大值(最大值为4923),X轴被拉得很长,导致1000以下的图形看不清楚,需要对X轴的取值范围进行限制:
> ggplot(aes(x = friend_count), data = pf) +
+ geom_histogram() +
+ scale_x_continuous(limits = c(0, 1000))
14. 调整组距
上面的图形以125为组距,不是很容易看清差异,通过调整组距能更好地反映数据变化。
# 设置X轴刻度(breaks):从0到1000,以50为步长;注意breaks只改变刻度,组距需通过 geom_histogram(binwidth = ...) 设置
> ggplot(aes(x = friend_count), data = pf) +
+ geom_histogram() +
+ scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
# 以性别gender进行分面
> ggplot(aes(x = friend_count), data = pf) +
+ geom_histogram() +
+ scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) + facet_wrap(~gender)
图形分别如下所示:
15. 忽略NA观测值
观察上面按gender分面后的图形,最后一组是NA(缺失值)组,需要将该组过滤掉,即对传给data参数的数据集进行过滤,如下:
> ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
+ geom_histogram() +
+ scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) + facet_wrap(~gender)
16. 按性别划分的统计学(by())
通过by函数可以分组获得统计值,by函数接收三个参数:
- 变量:要进行统计的数据
- 类别变量:用于划分子集的指标列表
- 函数:应用于每个子集的函数
> table(pf$gender)
female male
40254 58574
> by(pf$friend_count,pf$gender,summary)
pf$gender: female
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 37 96 242 244 4923
-----------------------------------------------------------------------------------
pf$gender: male
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 27 74 165 182 4917
>
17. 使用时长
下面是用户使用时长的直方图:第一个图的x轴单位是天,后面的图单位是年。注意两者binwidth的区别:binwidth以x轴的单位为基准,因此30表示30天,.25表示0.25年。
> ggplot(aes(x = tenure), data = pf) +
+ geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')
> ggplot(aes(x = tenure/365), data = pf) +
+ geom_histogram(binwidth = .25, color = 'black', fill = '#F79420')
还可以给X轴限定范围和步长:
> ggplot(aes(x = tenure/365), data = pf) +
+ geom_histogram(binwidth = .25, color = 'black', fill = '#F79420') +
+ scale_x_continuous(breaks = seq(1,7,1),limits=c(0,7))
19. 练习:用户年龄
组距:binwidth / 间断(刻度):breaks / 标签:labels
> ggplot(aes(x = age),data = pf) + geom_histogram(binwidth = 1, color = 'black', fill = '#F79420') +
scale_x_continuous(breaks = seq(1,150,5),limits=c(0,150))
注意上面binwidth为1,即每个柱(bar)的宽度为1岁;而breaks控制的是X轴上的刻度位置,这里从1开始、不超过150,每隔5岁一个刻度。
22. 转换数据
> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
> names(pf)
[1] "userid" "age" "dob_day"
[4] "dob_year" "dob_month" "gender"
[7] "tenure" "friend_count" "friendships_initiated"
[10] "likes" "likes_received" "mobile_likes"
[13] "mobile_likes_received" "www_likes" "www_likes_received"
>
> library(ggplot2)
> qplot(x=friend_count,data=pf)
>
在一列中创建以下三个直方图:
• 好友数
• 使用 log10 转换的好友数
• 使用 sqrt 转换的好友数
在一个图中创建所有三个直方图之前,你需要运行以下代码安装并加载 gridExtra:
install.packages('gridExtra')
library(gridExtra)
> summary(pf$friend_count)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 31.0 82.0 196.4 206.0 4923.0
> summary(log10(pf$friend_count))
Min. 1st Qu. Median Mean 3rd Qu. Max.
-Inf 1.491 1.914 -Inf 2.314 3.692
> summary(log10(pf$friend_count+1))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.505 1.919 1.868 2.316 3.692
> summary(sqrt(pf$friend_count))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 5.568 9.055 11.088 14.353 70.164
> library(gridExtra)
>
> q1 = qplot(x=friend_count,data=pf)
> q2 = qplot(x=log10(pf$friend_count+1),data=pf)
> q3 = qplot(x=sqrt(pf$friend_count),data=pf)
>
> grid.arrange(q1, q2, q3,ncol=1)
- 另一种方式也可以画出相同的图形:
> p1 <- ggplot(aes(x = friend_count),data=pf) + geom_histogram()
> p2 <- p1 + scale_x_log10()
> p3 <- p1 + scale_x_sqrt()
> grid.arrange(p1, p2, p3,ncol=1)
图形如下:
23. 添加定标层
> logScale <- qplot(x = log10(friend_count),data = pf)
> countScale <- ggplot(aes(x=friend_count),data = pf) +
+ geom_histogram() + scale_x_log10()
> library(ggplot2)
> library(gridExtra)
> grid.arrange(logScale,countScale,ncol = 2)
24. 频率多边形
> qplot(x = friend_count,data = subset(pf,!is.na(gender)),
+ binwidth = 10) +
+ scale_x_continuous(lim = c(0,1000),breaks = seq(0,1000,50)) +
+ facet_wrap(~gender)
qplot(x = friend_count,data = subset(pf,!is.na(gender)),
binwidth = 10,geom = "freqpoly",color = gender) +
scale_x_continuous(lim = c(0,1000),breaks = seq(0,1000,50))
- 使用频数多边形确定哪个性别在万维网 (www_likes) 上获得的点赞数量更多
qplot(x = www_likes,data = subset(pf,!is.na(gender)),
geom = "freqpoly",color=gender) +
scale_x_continuous() +
scale_x_log10()
25. 练习:网页端上的“点赞”数
by(pf$www_likes,pf$gender,sum)
pf$gender: female
[1] 3507665
pf$gender: male
[1] 1430175
by(pf$www_likes,pf$gender,summary)
pf$gender: female
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 87.14 25.00 14865.00
pf$gender: male
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 24.42 2.00 12903.00
26. 箱线图
- 方法1
qplot(x=gender,y=friend_count,
data = subset(pf,!is.na(gender)),
geom = "boxplot") +
scale_y_continuous(lim = c(0,1000),breaks = seq(0,1000,50))
- 方法2
qplot(x=gender,y=friend_count,
data = subset(pf,!is.na(gender)),
geom = "boxplot",ylim = c(0,1000))
27. 练习:箱线图、四分位数和友谊
qplot(x=gender,y=friend_count,
data = subset(pf,!is.na(gender)),
geom = "boxplot") +
coord_cartesian(ylim = c(0,250))
by(pf$friend_count,pf$gender,summary)
pf$gender: female
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 37 96 242 244 4923
pf$gender: male
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 27 74 165 182 4917
- 问题
谁发起了更多的交友请求,男士还是女士?
qplot(x=gender,y=friendships_initiated,
data = subset(pf,!is.na(gender)),geom="boxplot") +
coord_cartesian(ylim = c(0,150))
by(pf$friendships_initiated,pf$gender,summary)
pf$gender: female
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 19.0 49.0 113.9 124.8 3654.0
pf$gender: male
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 15.0 44.0 103.1 111.0 4144.0
28. 练习:符合逻辑
summary(pf$mobile_likes)
mobile_check_in <- NA
pf$mobile_check_in <- ifelse(pf$mobile_likes >0,1,0)
pf$mobile_check_in <- factor(pf$mobile_check_in)
summary(pf$mobile_check_in)
# summary(pf$mobile_likes) 的输出:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 4.0 106.1 46.0 25111.0
# summary(pf$mobile_check_in) 的输出:
0 1
35056 63947
由于 mobile_check_in 是一个因子变量,因此 sum() 函数无法直接对它运行。你可以使用 length() 函数来确定向量中值的数量。也可以让 mobile_check_in 保存布尔值,因为 sum() 函数可以处理布尔值(TRUE 为 1,FALSE 为 0)。
- 求用手机登录人数的百分比
sum(pf$mobile_check_in == 1) / length(pf$mobile_check_in)