## a set of nlp and webcrawling functions

# Download every unique link listed in a file, optionally crawling deeper.
#
# Args:
#   linkfile:   path to a text file; the first column holds one URL per row.
#   crawldepth: number of extract-links-and-download passes to run after the
#               initial download (0 means download only).
#
# Side effects:
#   Creates "<linkfile>_crawl" and writes each download there as
#   "<3rd '/'-separated part of the URL>_<last part of the URL>". Files smaller
#   than 1000 bytes are deleted again. When crawldepth > 0 the external script
#   link_stripper.sh is run on the download folder and the resulting
#   "all_links.txt" is downloaded in turn.
#
# Returns: NULL, invisibly.
linkDownload <- function(linkfile, crawldepth = 0) {
  # quote/comment.char are disabled so URLs containing ' " or # stay intact
  links <- read.table(linkfile, header = FALSE, colClasses = "character",
                      quote = "", comment.char = "")
  # only download uniques
  links <- unique(links[, 1])
  cat('downloading ', length(links), ' unique links \n')

  destfolder <- paste0(linkfile, "_crawl")
  if (!dir.exists(destfolder)) {
    dir.create(destfolder)
  }

  # file extensions that are never downloaded
  excludes <- c("jpg", "gif", "png", "png);")
  # downloads below this many bytes are discarded
  min.size <- 1000

  for (link in links) {
    parts <- strsplit(link, '/')[[1]]
    destfile.name <- paste0(destfolder, '/', parts[3], "_",
                            parts[length(parts)])
    ext.parts <- strsplit(destfile.name, '\\.')[[1]]
    extension <- tolower(ext.parts[length(ext.parts)])

    if (extension %in% excludes) {
      cat('excluded type \n')
      next
    }
    if (file.exists(destfile.name)) {
      cat('link already downloaded ', destfile.name, ' \n')
      next
    }

    # Best effort: a failed download is reported below via file.exists().
    # NOTE(review): method "internal" may not handle http(s) URLs on current
    # R builds -- confirm against the R version in use.
    try(download.file(link, destfile = destfile.name, method = "internal",
                      quiet = TRUE),
        silent = TRUE)

    if (!file.exists(destfile.name)) {
      cat(' ------ ', destfile.name, ' failed \n')
      next
    }

    cat(' ++++++ ', destfile.name, ' is good \n')
    size <- file.info(destfile.name)[, "size"]
    print(size)
    if (size < min.size) {
      cat('file too small...being removed \n')
      file.remove(destfile.name)
    }
  }

  if (crawldepth > 0) {
    for (j in seq_len(crawldepth)) {
      # run the system command which strips links and makes them into a file
      cat('extracting links from crawl', j, ' \n')
      system(paste0('sh link_stripper.sh ', shQuote(destfolder)))
      # run this function on the extracted links (no further recursion)
      cat('crawling at depth', j, ' \n')
      linkDownload(paste0(destfolder, '/all_links.txt'))
    }
  }
  invisible(NULL)
}

# Build a binary document-by-word presence matrix.
#
# Args:
#   documents:  character vector, one element per document.
#   lower.frac: a word is kept only if it occurs in MORE than
#               ceiling(lower.frac * number of documents) documents.
#   upper.frac: a word is kept only if it occurs in FEWER than
#               ceiling(upper.frac * number of documents) documents.
#
# Returns:
#   A numeric matrix with one row per document and one column per retained
#   word (column names are the words); a cell is 1 when the word occurs in the
#   document and 0 otherwise. Despite the variable name, no tf-idf weighting
#   is applied.
tokenize <- function(documents, lower.frac = 0.2, upper.frac = 0.8) {
  # documents should come in the format of one line per file
  # excludes and their replacements for substitution in the documents
  exclude <- c("\n", ">", "<", "\\.", ",", "!", "\\?", "/", '\\"', '=')
  replacement <- c(" ", " ", " ", "", "", "", "", "", "", "")
  documents <- tolower(documents)
  for (i in seq_along(exclude)) {
    cat('excluding ', exclude[i], '\n')
    documents <- gsub(exclude[i], replacement[i], documents)
  }

  # split the documents on each space
  documents.split <- strsplit(documents, " ")

  cat('creating a bag of words \n')
  # distinct words per document; empty tokens (from repeated spaces) are not
  # words and are dropped so they can never become a column
  per.doc <- vector("list", length(documents))
  for (i in seq_along(documents)) {
    cat('bag of words for file', i, '\n')
    words <- unique(documents.split[[i]])
    per.doc[[i]] <- words[nzchar(words)]
  }
  # later documents first, which fixes the column order of the result
  bagofwords <- as.character(unlist(rev(per.doc)))
  bagofwords.unique <- unique(bagofwords)

  # number of documents each word appears in
  cat('counting words \n')
  counts <- tabulate(match(bagofwords, bagofwords.unique),
                     nbins = length(bagofwords.unique))
  names(counts) <- bagofwords.unique
  print(summary(counts))

  # eliminate words under and above count threshold
  upper <- ceiling(upper.frac * length(documents))
  lower <- ceiling(lower.frac * length(documents))
  bagofwords.final <- bagofwords.unique[counts < upper & counts > lower]

  tf_idf <- array(0, c(length(documents), length(bagofwords.final)))
  colnames(tf_idf) <- bagofwords.final

  # go through and mark a 1 for present and 0 for not present
  cat('creating tf_idf matrix \n')
  for (i in seq_along(documents)) {
    tf_idf[i, bagofwords.final %in% documents.split[[i]]] <- 1
  }
  tf_idf
}

# Read every entry of a directory into a character vector, one element per
# entry, in the order returned by list.files().
#
# Args:
#   directory: path of the directory to read.
#
# Returns:
#   A character vector. For a regular file the element holds its
#   whitespace-separated tokens (as read by scan()) laid out as: one leading
#   space, three spaces between tokens, two trailing spaces; an empty file
#   gives "". For a sub-directory the element is the placeholder "theo".
docs2array <- function(directory) {
  files.list <- list.files(directory)
  documents <- rep("", length(files.list))
  for (i in seq_along(files.list)) {
    path <- file.path(directory, files.list[i])
    if (file.info(path)$isdir) {
      documents[i] <- "theo"
      next
    }
    # character mode; this is what the former what="raw" (a string) selected
    items <- scan(path, what = character())
    if (length(items) > 0) {
      # single collapse instead of growing the string token by token; the
      # spacing matches the historical output exactly
      documents[i] <- paste0(" ", paste(items, collapse = "   "), "  ")
    }
  }
  documents
}