Exporting collections to R (RDS) format
We are still unable to get Zotero items with their cached text into ITMS (Olga Scrivner's Interactive Text Mining Suite). Olga has written the following R-script to get exported csv-files (which contain the links to every folder where the pdf and the zotero-ft.cache are stored), but it doesn't work yet. If anybody with some coding knowledge would be willing to take a look and maybe help us out, that would be greatly appreciated.
### Export CSV file (no text, no notes)
### Load libraries (make sure they are all installed first)
library(readr)
library(qdapRegex)
library(tm)
library(stringr)
library(stringi)
library(qdap)
library(qdapRegex)
library(tibble)
library(mallet)
library(dplyr)
library(tidytext)
##select your csv file
mypath <- file.choose()
mydata <- read_csv(mypath)
shortdata <- mydata[,c("Key","Publication Year","Author","Title","Abstract Note","File Attachments")]
## loading text
id = vector()
lda.format <- vector()
terms <-list()
for (i in 1:nrow(shortdata)) {
path = shortdata['File Attachments'][[1]][i]
dotpath = strsplit(path, "/") # REPLACE by \ for PC - Let's hope there is not title with this symbol
paths = unlist(dotpath)[length(unlist(dotpath))]
uris.name = gsub(paths,".zotero-ft-cache", path) # replace pdf file name by cache file
text.scan <- scan(uris.name, what="character", sep="\n",blank.lines.skip = FALSE)
data=enc2utf8(text.scan)
loc.a<- grep("References|REFERENCES", data)
if (length(data[loc.a])>1){
text <- data[1:(loc.a[length(loc.a)]-1)]
}
else
text <- data[1:(loc.a-1)]
text <- paste(text, collapse = " ")
text <-gsub("-\\s+", "", text)
text.punct <- rm_email(text)
text.punct <- rm_url(text.punct)
text.punct <- rm_twitter_url(text.punct)
text.punct <-rm_citation(text.punct)
text.punct <-rm_citation(text.punct, pattern="@rm_citation3")
text.punct <-rm_citation(text.punct, pattern="@rm_citation2")
text.punct <-rm_round(text.punct)
text.punct <-rm_curly(text.punct)
text.punct <-rm_square(text.punct)
text.punct <- rm_bracket(text.punct)
text.punct <- strip(text.punct, char.keep="-",digit.remove = TRUE, apostrophe.remove = FALSE,
lower.case = TRUE)
text.punct <- removePunctuation(text.punct,
preserve_intra_word_contractions = TRUE,
preserve_intra_word_dashes = TRUE)
text.punct <- removeWords(text.punct, stopwords("en"))
text.punct <- gsub("\\s\\s+"," ",text.punct)
lda.format[i] <- text.punct
id[i] <- i
d <- data_frame(txt = text.punct)
freq <- d %>% unnest_tokens(word, txt) %>%
count(word) %>%
ungroup()
terms[[i]] <- freq
}
### adding other fields
titles <- mydata$Title
abstracts <- mydata$`Abstract Note`
authors <- mydata$Author
datetimes <- mydata$`Publication Year`
collection <- list(id = id, titles=titles, abstracts=abstracts, authors=authors,datetimes=datetimes,
terms=terms, text = lda.format)
### save the file and share with Olga
save_collection <- saveRDS(collection, "collection.rds")
### Export CSV file (no text, no notes)
### Load libraries (make sure they are all installed first)
library(readr)
library(qdapRegex)
library(tm)
library(stringr)
library(stringi)
library(qdap)
library(qdapRegex)
library(tibble)
library(mallet)
library(dplyr)
library(tidytext)
##select your csv file
mypath <- file.choose()
mydata <- read_csv(mypath)
shortdata <- mydata[,c("Key","Publication Year","Author","Title","Abstract Note","File Attachments")]
## loading text
id = vector()
lda.format <- vector()
terms <-list()
for (i in 1:nrow(shortdata)) {
path = shortdata['File Attachments'][[1]][i]
dotpath = strsplit(path, "/") # REPLACE by \ for PC - Let's hope there is not title with this symbol
paths = unlist(dotpath)[length(unlist(dotpath))]
uris.name = gsub(paths,".zotero-ft-cache", path) # replace pdf file name by cache file
text.scan <- scan(uris.name, what="character", sep="\n",blank.lines.skip = FALSE)
data=enc2utf8(text.scan)
loc.a<- grep("References|REFERENCES", data)
if (length(data[loc.a])>1){
text <- data[1:(loc.a[length(loc.a)]-1)]
}
else
text <- data[1:(loc.a-1)]
text <- paste(text, collapse = " ")
text <-gsub("-\\s+", "", text)
text.punct <- rm_email(text)
text.punct <- rm_url(text.punct)
text.punct <- rm_twitter_url(text.punct)
text.punct <-rm_citation(text.punct)
text.punct <-rm_citation(text.punct, pattern="@rm_citation3")
text.punct <-rm_citation(text.punct, pattern="@rm_citation2")
text.punct <-rm_round(text.punct)
text.punct <-rm_curly(text.punct)
text.punct <-rm_square(text.punct)
text.punct <- rm_bracket(text.punct)
text.punct <- strip(text.punct, char.keep="-",digit.remove = TRUE, apostrophe.remove = FALSE,
lower.case = TRUE)
text.punct <- removePunctuation(text.punct,
preserve_intra_word_contractions = TRUE,
preserve_intra_word_dashes = TRUE)
text.punct <- removeWords(text.punct, stopwords("en"))
text.punct <- gsub("\\s\\s+"," ",text.punct)
lda.format[i] <- text.punct
id[i] <- i
d <- data_frame(txt = text.punct)
freq <- d %>% unnest_tokens(word, txt) %>%
count(word) %>%
ungroup()
terms[[i]] <- freq
}
### adding other fields
titles <- mydata$Title
abstracts <- mydata$`Abstract Note`
authors <- mydata$Author
datetimes <- mydata$`Publication Year`
collection <- list(id = id, titles=titles, abstracts=abstracts, authors=authors,datetimes=datetimes,
terms=terms, text = lda.format)
### save the file and share with Olga
save_collection <- saveRDS(collection, "collection.rds")
Brenton - thanks for getting back to me. Here you'll find the test csv file, the R console output and the rds file it created. I feel bad about posting so much code in here, hence the link. There are a couple of errors in the console as you'll see
>ata=enc2utf8(text.scan)
> print(text.scan)
[1] "%PDF-1.6"
[2] "%âãÏÓ"
[3] "48 0 obj <</Linearized 1/L 103414/O 50/E 57535/N 14/T 102407/H [ 796 361]>>"
[4] "endobj"
[41] "7ÿóððpó\004s·rßåqã~Æ“È\023&*ËPËxŸ\u0081¡GêÚ!Þæ¤\005,\b\003\030\u008f4.銌¹~Èh&˜\025Ïqâ‘Ã%W\u009dõ-)…"
[42] "™JÓÂ\017\031YL0™è]ìõÃ-\030ÌÒž\005g=””ñ\u0090òX:õl\u008d²æ\0340ëCOŸ" [43] "¯JÐ\u009dÛ¢\037z–-îÌ\u0090Œç¸\003¶G©££\001H±¥AéŽ\016\u0090h\0062Ç%\003B§e H¢ê0v\003r\005…T:ðz\024\b4\031\030kÃ\u0081´\030\020K€E\f\030ø\030]8\027ä\035ôæ¨\001r\030Î2ëð8ð:\030\033qHM0g\022\\a:g_PâA¶¹×\034ä\031´ ¡£ÏÀØ{\024\024J@|\016 À"
[44] "endstream"
[45] "endobj"
> text.scan <- scan(uris.name, what="character", sep="\n",blank.lines.skip = FALSE)
Read 74 items
Warning message:
In scan(uris.name, what = "character", sep = "\n", blank.lines.skip = FALSE) :
embedded nul(s) found in input
Maybe seeing this may help you in putting us on the right track?