forked from mayaman/UFOSightings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis_ufo.R
244 lines (201 loc) · 8.66 KB
/
analysis_ufo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# Call in libraries
library(tidyverse)
library(lubridate)
library(ggplot2)
library(ggmap)
library(maps)
library(mapdata)
# load map data
states <- map_data("county")
usa <- map_data("usa")
# read in data, take out unnecessary rows
pop = read.csv("state_pops.csv")[,1:3]
data = read.csv("new_ufo_dat.csv")
data = data[-43775,]
# prepare data to create pop density plot
data.bystate = filter(data, Country == "us") %>% group_by(State)
data.bystate = filter(data.bystate, State != "hi")
data.bystate = filter(data.bystate, State != "ak")
state.count = data.bystate %>% count(State)
merged = merge(pop, state.count, by = "State")
merged$HundredThPpl = merged$pop/100000
merged$Density = merged$n/merged$HundredThPpl
all = merge(states, merged, by.x = "region", by.y = "StateFull")
# plot the density by state
p <- ggplot()
p <- p + geom_polygon(data=all, aes(x=long,y=lat,group=group,fill=all$Density), colour = "white") +
scale_fill_continuous(low = "thistle2", high = "darkred", guide = "colorbar")
p
# plot the location of sightings
ggplot() +
geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = NA, color = "black") +
coord_fixed(1.3) + geom_point(data = data.bystate, aes(x = Longitude, y = Latitude), color = "red", size = 0.1)
# kmeans clustering
fit.km <- kmeans(data[,10:11], 100, 1000)
data$cluster <- fit.km$cluster
# plot world points
world <- map_data("world")
ggplot() +
geom_polygon(data = world, aes(x=long, y = lat, group = group), fill = NA, color = "black") +
coord_fixed(1.3) + geom_point(data = data, aes(x = Longitude, y = Latitude), color = "red", size = 0.1)
# Filter to get lower 48
US <- filter(data, State != "")
US <- filter(US, State != "ak")
US <- filter(US, State != "hi")
US <- filter(US, Latitude < 50)
US <- filter(US, Latitude > 20)
US <- filter(US, Longitude > -130)
US <- filter(US, Longitude < -65)
# plot density
p <- ggplot() + geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = NA, color = "black") + coord_fixed(1.3)
overlay <- geom_density2d(data = US, aes(x = Longitude, y = Latitude))
p + overlay + stat_density2d(data = US, aes( x = Longitude, y = Latitude, fill = ..level.., alpha = ..level..), size = 0.01, bins = 16, geom = 'polygon') +
scale_fill_gradient(low = "green", high = "red") + scale_alpha(range = c(0.00, 0.30), guide = FALSE) + ggtitle("Density of Sightings Across the United States")
# Combine shapes to add up their frequencies
unknown <- filter(data, Shape %in% c("", "other", "unknown"))
change <- filter(data, Shape %in% c("changed", "changing"))
triangle <- filter(data, Shape %in% c("chevron", "delta", "triangle", "cone", "pyramid"))
cylinder <- filter(data, Shape %in% c("cigar", "cylinder", "rectangle"))
circle <- filter(data, Shape %in% c("circle", "disk", "dome", "egg", "hexagon", "oval", "round", "sphere", "teardrop"))
light <- filter(data, Shape %in% c("flare", "flash", "light"))
fireball <- filter(data, Shape %in% c("fireball"))
formation <- filter(data, Shape %in% c("formation"))
# new mexico sightings
nm = filter(data, State == "nm")
nm = filter(nm, Latitude > 30)
nm.region <- subset(states, region %in% c("new mexico"))
ggplot(data = nm.region) +
geom_polygon(aes(x = long, y = lat, group = group), fill = "palegreen", color = "black") +
coord_fixed(1.3) +
geom_point(data = nm, aes(x=Longitude,y=Latitude)) +
geom_point(aes(x=-105.243,y=33.968), col="red", size=7) +
ggtitle("New Mexico Sightings (Roswell in Red)")
# southern california region sightings
ca.region <- subset(states, region %in% c("california"))
ca.region.southern <- subset(ca.region, subregion %in% c("san luis obispo", "san bernardino", "santa barbara", "los angeles", "orange", "riverside","imperial","san diego","kern", "ventura"))
california <- filter(US, Latitude < 36)
california <- filter(california, Latitude > 32)
california <- filter(california, Longitude < -114)
california <- filter(california, Longitude > -122)
ggplot(data = ca.region.southern) +
geom_polygon(aes(x = long, y = lat, group = group), fill = "palegreen", color="black") +
coord_fixed(1.3) +
geom_point(data = california, aes(x=Longitude,y=Latitude)) +
geom_point(aes(x=-117.7118,y=34.0977),col = "blue", size = 8) +
geom_point(aes(x=-117.7118,y=34.0977),col = "orange", size = 7) +
ggtitle("Sightings in Southern California (Claremont in orange)")
# area51
nv = filter(data, State == "nv")
nv = filter(nv, Longitude < -100)
nv.region <- subset(states, region %in% c("nevada"))
ggplot(data = nv.region) +
geom_polygon(aes(x = long, y = lat, group = group), fill = "palegreen", color = "black") +
coord_fixed(1.3) +
geom_point(data = nv, aes(x=Longitude,y=Latitude)) +
geom_point(aes(x=-115.811111,y=37.235), col="red", size=3) +
ggtitle("Nevada Sightings (Area 51 in Red)")
# plant 42
ca.region <- subset(states, region %in% c("california"))
ca.region.southern <- subset(ca.region, subregion %in% c("san luis obispo", "san bernardino", "santa barbara", "los angeles", "orange", "riverside","imperial","san diego","kern", "ventura"))
plant42 = filter(data, State == "ca")
plant42 <- filter(plant42, Latitude < 36)
plant42 <- filter(plant42, Latitude > 32)
plant42 <- filter(plant42, Longitude < -114)
plant42 <- filter(plant42, Longitude > -122)
ggplot(data = ca.region.southern) +
geom_polygon(aes(x = long, y = lat, group = group), fill = "palegreen", color="black") +
coord_fixed(1.3) +
geom_point(data = plant42, aes(x=Longitude,y=Latitude)) +
geom_point(aes(x=-117.7118,y=34.0977),col = "blue", size = 3) +
geom_point(aes(x=-117.7118,y=34.0977),col = "orange", size = 2) +
ggtitle("Sightings near Plant42 (Plant42 in orange)")
# plot by cluster
g <- ggplot(data, aes(y=Latitude,x=Longitude,color=as.factor(cluster))) + geom_point()
g <- g + geom_point(aes(y=Latitude,x=Longitude))
g
# Prepare data for Maya, redo kmeans, use lots of itr to converge to decent soln
data.time <- filter(data, Year < 2018)
fit.km <- kmeans(data.time[,10:11], centers = 500, iter.max = 100000)
totals <- c()
for(i in 1:500) {
totals[i] = table(fit.km$cluster)[i]
}
maya.dat <- as.data.frame(fit.km$centers)
maya.dat$Magnitude <- totals
# write.csv maya.dat for her use
# Topic Modeling
library(tidytext)
library(tm)
library("tmap")
library("tmaptools")
library("sf")
library("leaflet")
library("scales")
library(topicmodels)
library(NLP)
library(ggstance)
library(ggthemes)
#library(broom)
text_corpus <- Corpus(VectorSource(as.vector(data$Report)))
text_corpus <- tm_map(text_corpus, content_transformer(removePunctuation))
text_corpus <- tm_map(text_corpus, content_transformer(tolower))
text_corpus <- tm_map(text_corpus, content_transformer(stripWhitespace))
text_corpus <- tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus <- tm_map(text_corpus, content_transformer(stemDocument), language="english")
DTM <- DocumentTermMatrix(text_corpus, control= list(wordLengths = c(2, Inf)))
DTM <- removeSparseTerms(DTM, .990)
rowTotals <- apply(DTM, 1, sum)
DTM <- DTM[rowTotals > 0, ]
(freq.terms <- findFreqTerms(DTM, lowfreq=15))
term.freq <- rowSums(as.matrix(DTM))
term.freq <- subset(term.freq, term.freq >=5)
df2 <- data.frame(term = names(term.freq), freq = term.freq)
numTopics <- 5
#Run Latent Dirichlet Allocation (LDA) using Gibbs Sampling
#set burn in
burnin <-4
#set iterations
iter<-20
#thin the spaces between samples
thin <- 5
#set random starts at 5
nstart <-4
#use random integers as seed
seed <- list(22,12,48843,48314)
# return the highest probability as the result
best <-TRUE
#set number of topics
k <-5
#run the LDA model
ldaOut <- LDA(DTM,k, method="Gibbs", control=
list(nstart=nstart, seed = seed, best=best, burnin = burnin, iter = iter, thin=thin))
gen_topics <- as.matrix(topics(ldaOut))
gen_terms <- as.matrix(terms(ldaOut, 10))
tidy_lda <- tidy(ldaOut)
tidy_lda
top_terms <- tidy_lda %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
lda_gamma <- tidy(ldaOut, matrix = "gamma")
lda_gamma
ggplot(top_terms, aes(beta, term, fill = as.factor(topic))) +
geom_barh(stat = "identity", show.legend = FALSE, alpha = 0.8) +
labs(title = "Top 10 Terms in Each LDA Topic",
y = NULL, x = "beta") +
facet_wrap(~topic, ncol = 2, scales = "free") +
theme_tufte(base_size = 13, ticks = FALSE) +
scale_x_continuous(expand=c(0,0)) +
theme(strip.text=element_text(hjust=0)) +
theme(plot.caption=element_text(size=9))
ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
geom_histogram(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~topic, ncol = 3) +
scale_y_log10() +
labs(title = "Distribution of Probability for Each Topic",
y = NULL, x = "gamma") +
theme_minimal(base_size = 13) +
theme(strip.text=element_text(hjust=0)) +
theme(plot.caption=element_text(size=9))