-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathInternal-PageRank-Calculation-2.r
58 lines (44 loc) · 1.65 KB
/
Internal-PageRank-Calculation-2.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(igraph)
library(dplyr)
library(ggplot2)
library(magrittr)
#library(ForceAtlas2)
# Path of the outlinks CSV export that is read further down in this script.
file_outlinks <- 'C:/Users/evgeniy/Downloads/all_outlinks.csv'
# Site prefix: used to keep only links whose source is on this site and to
# strip the prefix from URLs. The scheme separator must be "://"; without the
# colon the prefix never occurs in a real URL and every link is filtered out.
website_url <- 'https://www.flyeralarm.com'
#' Linearly rescale numeric values onto a target interval.
#'
#' Used in this script to turn raw internal PageRank scores into a bounded
#' score (e.g. 1 to 10).
#'
#' NOTE: the name shadows `purrr::map()` if purrr is attached after this
#' definition is created or vice versa; it is kept for compatibility with
#' existing callers.
#'
#' @param x Numeric vector or matrix to rescale.
#' @param range Length-2 numeric target interval. If `range[1] > range[2]`
#'   the mapping is reversed (the largest input maps to `range[2]`).
#' @param from.range Length-2 numeric source interval. If it contains `NA`
#'   (the default) the observed range of `x` (ignoring `NA`) is used.
#' @return An object shaped like `x` with values mapped into `range`. Values
#'   that land outside the target interval (possible when `from.range` is
#'   narrower than the data) become `NA`. When the source interval has zero
#'   width, every element is set to the midpoint of `range`.
map <- function(x, range = c(0, 1), from.range = NA) {
  if (any(is.na(from.range))) {
    from.range <- range(x, na.rm = TRUE)
  }
  span <- diff(from.range)

  # Zero-width source interval: all values are the same, so there is nothing
  # to scale by. Return the midpoint of the target interval in the shape of
  # the input. Plain vectors have no dim(), so they cannot go through
  # matrix(nrow = nrow(x), ncol = ncol(x)).
  if (span == 0) {
    midpoint <- mean(range)
    if (is.null(dim(x))) {
      out <- rep(midpoint, length(x))
      names(out) <- names(x)
      return(out)
    }
    return(
      matrix(midpoint, nrow = nrow(x), ncol = ncol(x), dimnames = dimnames(x))
    )
  }

  # Normalise to [0, 1] relative to the source interval.
  x <- (x - from.range[1]) / span

  # A descending target interval flips the orientation.
  if (range[1] > range[2]) {
    x <- 1 - x
  }

  # Stretch and shift from [0, 1] onto the target interval.
  x <- x * abs(diff(range)) + min(range)

  # Anything outside the target interval is reported as missing.
  x[x < min(range) | x > max(range)] <- NA
  x
}
# Load the outlinks export as comma-separated text. skip = 1 drops the first
# line of the file, so the second line is used as the header row.
DF <- read.csv2(file_outlinks, header = TRUE, sep = ",",
                stringsAsFactors = FALSE, skip = 1)
## Keep only followed HREF links whose source page is on the analysed site.
## - fixed = TRUE matches the URL literally, so its dots are not treated as
##   regex wildcards.
## - Follow is normalised to lower-case text before comparing, because the CSV
##   reader may already have converted a true/false column to logical, in
##   which case a direct comparison with "true" would never match.
DF <- filter(
  DF,
  grepl(website_url, Source, fixed = TRUE) &
    Type == "HREF" &
    tolower(as.character(Follow)) == "true"
) %>%
  select(Source, Destination)
## Strip the site prefix so nodes are identified by path. Assigning into DF[]
## column by column keeps the two-column data frame shape even when only one
## row is left (sapply() would collapse that case to a plain vector).
DF[] <- lapply(DF, gsub, pattern = website_url, replacement = "", fixed = TRUE)
## adapt colnames and rownames
colnames(DF) <- c("Source", "Destination")
rownames(DF) <- NULL
# Build a directed graph with one edge per Source -> Destination row of DF.
# graph_from_data_frame() is the current igraph name for graph.data.frame().
graphObject <- graph_from_data_frame(DF)

# Compute PageRank (damping factor 0.85) for every vertex and order the
# scores from highest to lowest. page_rank() is the current igraph name for
# page.rank(); its $vector element is a numeric vector named by vertex.
pagerank_scores <- page_rank(graphObject, directed = TRUE, damping = 0.85)$vector
pagerank_scores <- sort(pagerank_scores, decreasing = TRUE)

# One row per URL: the raw score plus the vertex name it belongs to.
# as.character() keeps Address a zero-length character column if the graph
# has no vertices.
urls_pagerank <- data.frame(
  raw.internal.pagerank = unname(pagerank_scores),
  Address = as.character(names(pagerank_scores)),
  stringsAsFactors = FALSE
)

# Rescale the raw scores onto a 1-10 scale with the map() helper defined
# earlier in this file.
urls_pagerank <- mutate(
  urls_pagerank,
  internal.pagerank = map(raw.internal.pagerank, c(1, 10))
)