文献计量学系列29:关键词中的同/近义词合并
内容涵盖文档、作者、期刊、研究机构和国家等相关文献计量学指标分析
更多自定义函数
一次性获取较多文献计量指标
让学习更轻松!
学习力,才是最大的竞争力!扫码约我吧!
一、数据导入与处理
# Load (installing on demand) every package this script relies on.
pacman::p_load(bibliometrix, tidyverse, pluralize, do)
# Path to the Web of Science plain-text export. The stray blank that used to
# follow ".txt" was removed so the path names the file exactly.
D <- "E:/精鼎统计/savedrecs.txt"
# Convert the raw WoS export into a bibliometrix data frame.
M <- convert2df(D, dbsource = 'wos', format = 'plaintext')
# Merge the DE and ID keyword fields into a new DEID column.
# NOTE(review): merge_field() is not defined in this excerpt; per the original
# author's note it wraps the code from part 27 of the series and performs light
# cleaning (de-duplication, plural -> singular) -- confirm against its definition.
M$DEID <- merge_field(M$DE, M$ID)
# Keep only the records that have a DEID value. Logical indexing is deliberate:
# M[-which(is.na(x)), ] returns ZERO rows whenever no value is NA, because
# -integer(0) selects nothing.
m <- M[!is.na(M$DEID), ]
二、自定义函数构建
由于同/近义词需要分步合并,同一段代码会被反复运行。如果一段代码需要重复运行三次及以上,就可以考虑把它封装成一个函数。
#' Merge synonymous keywords into one canonical keyword
#'
#' Every keyword that belongs to a synonym group is rewritten to the first
#' member of that group. Groups are applied one after another, in the order
#' given, so a keyword rewritten by an earlier group can be rewritten again by
#' a later one.
#'
#' @param Terms Character vector; each element holds the keywords of one
#'   record, separated by `sep`.
#' @param synonyms Character vector of synonym groups. Each element lists the
#'   members of one group separated by ";"; the first member is the canonical
#'   keyword. Groups are upper-cased and stripped of surrounding whitespace
#'   before matching; the keywords in `Terms` are matched as they are.
#' @param sep Separator between keywords in `Terms`, passed to `strsplit()`
#'   (and therefore interpreted as a regular expression).
#' @return Character vector of the same length as `Terms`, keywords joined by
#'   ";". `NA` records stay `NA` instead of turning into the text "NA".
Synonyms_merge <- function(Terms, synonyms, sep = ';') {
  # Parse the synonym groups once, not once per record.
  groups <- lapply(strsplit(toupper(synonyms), split = ";"), trimws)
  # An empty group has no canonical keyword, so there is nothing to merge.
  groups <- Filter(function(group) length(group) > 0, groups)

  merged <- lapply(strsplit(Terms, split = sep), function(keywords) {
    # Iterating over the groups themselves is safe when there are none,
    # unlike 1:length(synonyms), which counts 1, 0 for an empty vector.
    for (group in groups) {
      keywords[keywords %in% group] <- group[1]
    }
    keywords
  })

  # vapply() guarantees a character vector, also for zero-length input.
  out <- vapply(merged, paste0, character(1), collapse = ";")
  # paste0() would otherwise turn a missing record into the keyword "NA".
  out[is.na(Terms)] <- NA_character_
  out
}
#' Split compound phrases into several keywords
#'
#' Applies "PHRASE:REPLACEMENT" rules to the keywords of every record through
#' `Replace()` and then removes duplicate keywords within each record.
#'
#' @param Terms Character vector; each element holds the keywords of one
#'   record, separated by `sep`.
#' @param phrases2keywords Character vector of rules handed to
#'   `Replace(pattern = ...)` after upper-casing. A replacement may contain
#'   several keywords separated by ";".
#' @param sep Separator between keywords in `Terms`, passed to `strsplit()`
#'   (and therefore interpreted as a regular expression).
#' @return Character vector of the same length as `Terms`, keywords joined by
#'   ";". `NA` records stay `NA` instead of turning into the text "NA".
Phrases2keywords <- function(Terms, phrases2keywords, sep = ';') {
  rules <- toupper(phrases2keywords)

  expanded <- lapply(strsplit(Terms, split = sep), function(keywords) {
    keywords <- Replace(keywords, pattern = rules)
    # A replaced phrase is still ONE element such as "A;B". Split it into its
    # keywords first, otherwise unique() cannot see that "A" may already be
    # present in the record and the duplicate survives.
    keywords <- unlist(strsplit(keywords, split = ";", fixed = TRUE))
    unique(keywords)
  })

  # vapply() guarantees a character vector, also for zero-length input.
  out <- vapply(expanded, paste0, character(1), collapse = ";")
  # paste0() would otherwise turn a missing record into the keyword "NA".
  out[is.na(Terms)] <- NA_character_
  out
}
三、特殊的同/近义词合并
# Strip every character from the keywords except letters, digits, blanks,
# "/", ";" and "&".
m$DEID <- gsub(
  pattern = "[^[:alnum:][:blank:]\\/\\;\\&]",
  replacement = "",
  x = m$DEID
)
# List the keyword phrases that contain " AND ".
# NOTE(review): DEID is not created anywhere in this excerpt; judging by the
# printed output it is a keyword frequency table with columns Tab and Freq --
# confirm where it comes from.
DEID[grep(pattern = ' AND ', x = DEID$Tab), ]
# Tab Freq
# 397 O AND H ISOTOPE 2
# 441 STABLE HYDROGEN AND OXYGEN ISOTOPE 2
# 495 18O 2H AND 3H 1
# 686 DELTA D AND DELTA O 18 1
# 692 DEUTERIUM AND OXYGEN ISOTOPE 1
# 766 EVENT AND PRE EVENT WATER 1
# 851 GROUNDWATER AND SURFACE 1
# 895 HYDROGEN AND OXYGEN 1
# 937 INFILTRATION AND INFLOW 1
# 956 ISOTOPIC AND GEOCHEMICAL TRACER 1
# 959 ISOTOPIC EXCHANGE BETWEEN LIQUID AND ICE 1
# 1009 LOW TECH AND LOW COST 1
# 1282 SNOW AND ICE MELT 1
# 1291 SNOWMELT AND GLACIER MELT DYNAMIC 1
# 1335 STABLE AND RADIOACTIVE ISOTOPE 1
# 1341 STABLE OXYGEN AND HYDROGEN ISOTOPE 1
# 1405 TEMPORAL AND SPATIAL VARIATION 1
# 1498 WATER ISOTOPES AND ELECTRICAL CONDUCTIVITY 1
# One synonym group, members separated by ";": the keywords to be merged.
synonyms_and <- 'STABLE HYDROGEN AND OXYGEN ISOTOPE;O AND H ISOTOPE;DELTA D AND DELTA O 18;DEUTERIUM AND OXYGEN ISOTOPE;STABLE OXYGEN AND HYDROGEN ISOTOPE'
# Rewrite every member of the group to the keyword before the first ";",
# i.e. "STABLE HYDROGEN AND OXYGEN ISOTOPE".
m$DEID <- Synonyms_merge(m$DEID, synonyms_and)
# Rules that split one compound phrase into several keywords. Each element
# reads "PHRASE:KEYWORD 1;KEYWORD 2": the phrase before ":" is replaced by the
# keywords after it.
# NOTE(review): 'SURFACE-WATER' contains a hyphen, although the earlier gsub()
# clean-up of m$DEID removes hyphens from every existing keyword -- confirm
# which spelling the rest of the data uses.
phrase2words <- c(
  'STABLE HYDROGEN AND OXYGEN ISOTOPE:DEUTERIUM;OXYGEN STABLE ISOTOPE',
  '18O 2H AND 3H:OXYGEN STABLE ISOTOPE;DEUTERIUM;TRITIUM',
  # Fixed typo: the replacement used to read 'ENENT WATER'.
  'EVENT AND PRE EVENT WATER:EVENT WATER;PRE EVENT WATER',
  'GROUNDWATER AND SURFACE:GROUNDWATER;SURFACE-WATER',
  'ISOTOPIC AND GEOCHEMICAL TRACER:ISOTOPIC TRACER;GEOCHEMICAL TRACER',
  'LOW TECH AND LOW COST:LOW TECH;LOW COST',
  'SNOW AND ICE MELT:SNOWMELT;ICE MELT',
  'SNOWMELT AND GLACIER MELT DYNAMIC:SNOWMELT;GLACIER MELT',
  'STABLE AND RADIOACTIVE ISOTOPE:STABLE ISOTOPE;RADIOACTIVE ISOTOPE',
  'TEMPORAL AND SPATIAL VARIATION:TEMPORAL VARIATION;SPATIAL VARIATION',
  'WATER ISOTOPES AND ELECTRICAL CONDUCTIVITY:WATER ISOTOPES;ELECTRICAL CONDUCTIVITY'
)
# Apply the rules, e.g. "STABLE HYDROGEN AND OXYGEN ISOTOPE" becomes
# "DEUTERIUM;OXYGEN STABLE ISOTOPE".
m$DEID <- Phrases2keywords(Terms = m$DEID, phrases2keywords = phrase2words)
# List the keywords that contain at least one digit.
DEID[grep(pattern = '\\d', x = DEID$Tab), ]
# Tab Freq
# 23 O 18 32
# 25 OXYGEN 18 30
# 53 DELTA O 18 14
# 130 2 COMPONENT 6
# 136 HYDROGEN 2 6
# 178 3 COMPONENT 4
# 186 DELTA H 2 4
# 321 CARBON 13 2
# 356 GLACIER NO 1 2
# 492 1/F 1
# 493 18O 1
# 494 2 AGRICULTURAL HILLSLOPE 1
# 495 2 FORESTED 1
# 496 3 COMPONENT TRACER MODEL 1
# 606 CARBON 14 1
# 648 CO2 OUTGASSING 1
# 684 DELTA18O 1
# 888 HYDROGEN 3 1
# 1090 NO 1 1
# 1106 O 18 ISOTOPE 1
# 1119 OXYGEN 18 COMPOSITION 1
# 1127 PART 2 1
# 1218 RN 222 1
# 1247 SEAWATER SR 87/SR 86 1
# 1258 SF6 1
# 1327 SR 87/SR 86 RATIO 1
# 1376 SURFACE PARAMETERIZATION SIB2 1
# Synonyms_number <- c("OXYGEN STABLE ISOTOPE;O 18;OXYGEN 18;DELTA O 18;18O;DELTA18O;O 18 ISOTOPE;OXYGEN 18 COMPOSITION",
# "TWO COMPONENT;2 COMPONENT",
# "DEUTERIUM;HYDROGEN 2;DELTA H 2",
# "THREE COMPONENT;3 COMPONENT;3 COMPONENT TRACER MODEL",
# "CARBON STABLE ISOTOPE;CARBON 13",
# "CARBON FOURTEEN;CARBON 14",
# "TRITIUM;HYDROGEN 3",
# "RADON ISOTOPE;RN 222",
# "SR ISOTOPE;SEAWATER SR 87/SR 86;SR 87/SR 86 RATIO")#数字关键词转换向量
#
# m$DEID <- Synonyms_merge(Terms = m$DEID,synonyms = Synonyms_number)#所有数字关键词均转换为第一个关键词
至此,带有“AND”和数字的特殊关键词已做处理。同时,大家可以根据研究的需要,对带有其他符号的特殊关键词做类似的处理。
五、小结
赞 (0)