# Known issue: this script does not download the full page; the reason is
# unknown.

# Locale setup. Original note: this must be set for Chinese.
Sys.setlocale(category = 'LC_ALL', 'Chinese')
#Sys.setlocale(, 'English')
#options("encoding" = "native.enc")
#options("encoding" = "UTF-8")

# NOTE(review): hard-coded, machine-specific working directory. Kept as-is;
# presumably the output file named by theFilename is written relative to it --
# confirm before changing.
setwd("C:/Users/User/Pictures/sexpage")

#library(audio)
library(rvest)
library(crayon)

# Console colour style built with crayon.
ligSilver <- make_style("#889988")

# Each page URL is built as pageHeader + addr[i] + pageTail.
pageHeader <- "https://www.xvideos.com/pornstars-index/"
pageTail <- ""
# CSS selector handed to html_nodes() for every fetched page.
className <- ".thumb a, .with-sub"
titleName <- "Pornstars"
theFilename <- paste0("Xvideos", titleName, ".html")

wholePage <- character()
# TODO (original note): remember to remove
addr <- 0:372
lentocpage <- length(addr)
cat("\nlentocpage: ",lentocpage,"\n")

ProcessStartTime <- Sys.time()
cat(format(Sys.time(), "%H:%M:%OS"),"\n")

# Format an elapsed time as "<days> day HH:MM:SS".
#
# t: elapsed time, either a number of seconds or a difftime (converted to
#    seconds whatever units it carries).
# Returns a character string such as "1 day 02:03:04".
dhms <- function(t) {
  if (inherits(t, "difftime")) {
    # A difftime may be in mins/hours/days; normalise to seconds explicitly.
    t <- as.numeric(t, units = "secs")
  }
  # Drop fractional seconds up front: formatC(format = "d") rounds doubles,
  # so 59.7 seconds would otherwise be rendered as ":60".
  t <- floor(t)
  paste(
    t %/% (60 * 60 * 24), "day",
    paste(
      formatC(t %/% (60 * 60) %% 24, width = 2, format = "d", flag = "0"),
      formatC(t %/% 60 %% 60, width = 2, format = "d", flag = "0"),
      formatC(t %% 60, width = 2, format = "d", flag = "0"),
      sep = ":"
    )
  )
}

# Fetch each index page in turn and pull out the nodes matching className.
for(i in 1:length(addr)){
  cat(i, "/", length(addr), " ")
  #guess_encoding(pagesource)
  #pagesource <- read_html(paste0(pageHeader,addr[i],pageTail), encoding = "UTF-8")
  #url = paste0(pageHeader,addr[i],pageTail)
  url = paste0(pageHeader,addr[i],pageTail)
  cat(url, "\n")
  pagesource <- read_html(url)
  itemList <- html_nodes(pagesource, className)
  itemList = as.character(itemList)
  itemList = gsub('