学习别人写的代码时,满眼都是变量、函数和常量,很容易搞混。这里试着用 R 的 tidyverse 和 data.table 自动梳理代码中的变量和函数,最终生成二维数据,再用关系图类的包或单独的关系图软件出图。
1、原代码图样(python)
2、R语言清理代码
library(tidyverse)
library(data.table)
# Read the input file and name its column `a`, then convert the result to a
# data.table in place so the data.table syntax used below works.
# NOTE(review): presumably mydt.csv holds one line of Python source per row in
# a single column -- confirm the file layout (header row, quoting).
mydt <- setNames(read_csv("mydt.csv"), "a")
setDT(mydt)
# Strip comments: everything from a "#" (followed by at least one character)
# to the end of the line.
mydt[, a := str_remove(a, "#.+")]
# Drop rows that are empty.
mydt <- mydt[a != ""]
# Keep only assignment-like lines: the text must contain "=", but must not
# contain a comparison operator, which would interfere with the later split
# on "=".
has_comparison <- str_detect(mydt$a, "(==)|(!=)|(>=)|(<=)")
has_equals <- str_detect(mydt$a, "=")
mydt <- mydt[has_equals & !has_comparison]
# Assignments whose right-hand side starts with a number, a single-quoted
# string, a list, or a tuple beginning with a number.
literal_rhs <- str_detect(mydt$a, "(\\= [0-9]+)|(\\= ')|(\\= \\[)|(\\= \\([0-9]+)")
# Show the rows that are about to be discarded.
mydt[literal_rhs]
# Flag variables (assignments of True / False).
is_flag <- str_detect(mydt$a, "(\\= True)|(\\= False)")
# Remove both the literal assignments and the flag assignments.
mydt <- mydt[!(literal_rhs | is_flag)]
# Split every line into its assignment target(s) `v` and right-hand side `r`.
# The split happens at the FIRST "=" only: a right-hand side that itself
# contains "=" (for example a keyword argument such as `f(a, key=b)`) would
# otherwise be cut off at the second "=" and lose everything after it.
# str_split_fixed() always returns a matrix with exactly 2 columns, padding
# with "" when needed.
tmp <- str_split_fixed(mydt$a, "=", 2)
mydt$v <- str_trim(tmp[, 1])
mydt$r <- str_trim(tmp[, 2])
# Expand rows whose `v` lists several assignment targets on one line
# (e.g. `a, b = f(x)`) into one row per target, each keeping the same `a`
# and `r`.
# Splitting on every comma handles any number of targets and leaves dotted or
# indexed names (such as `self.x`) intact, which a `\w+,` pattern does not.
pt <- str_detect(mydt$v, "\\,")
tmp <- mydt[pt]
mydt <- mydt[!pt]
if (nrow(tmp) > 0) {
  targets <- str_split(tmp$v, ",")
  # Repeat each source row once per target it defines.
  tt <- tmp[rep(seq_len(nrow(tmp)), lengths(targets))]
  tt$v <- str_trim(unlist(targets))
  # A trailing comma (`x, = f()`) yields an empty target; discard it.
  tt <- tt[v != ""]
  # Show the expanded rows.
  print(tt)
  # Merge the expanded rows back into mydt.
  mydt <- rbind(mydt, tt)
}
# Derive two columns from the right-hand side `r`:
#   p - the text up to and including the first "(", with parentheses removed
#       (NA when `r` contains no "(")
#   q - the remainder of `r` after that prefix, with the operators *, + and -
#       replaced by "," and all parentheses removed, ready to be split on ","
call_prefix <- str_extract(mydt$r, ".+?\\(")
call_args <- str_remove(mydt$r, ".+?\\(")
# Turn arithmetic operators into "," so operands can be split like arguments.
call_args <- str_replace_all(call_args, "\\*|\\+|\\-", ",")
mydt[, `:=`(
  p = str_replace_all(call_prefix, "\\(|\\)", ""),
  q = str_replace_all(call_args, "\\(|\\)", "")
)]
# Split `q` on "," and build the long table directly: one row per
# (target v, function p, operand b). Column `A` records the operand position
# as "V1", "V2", ... so the output keeps the columns v, p, A, b.
# Building the long form from the split list avoids hard-coding the number of
# operand columns, so lines with any number of operands are handled.
operands <- str_split(mydt$q, ",")
n_operands <- lengths(operands)
mydt <- data.table(
  v = rep(mydt$v, n_operands),
  p = rep(mydt$p, n_operands),
  A = sprintf("V%d", sequence(n_operands)),
  # Trim so that " y" and "y" are the same node in the relation graph.
  b = str_trim(as.character(unlist(operands)))
)
mydt <- unique(mydt)
# Drop empty operands and rows whose right-hand side had no function call.
mydt <- mydt[!is.na(b) & b != ""]
mydt <- mydt[!is.na(p)]
# NOTE(review): this writes to the same path the script reads its input from,
# so the input file is overwritten -- confirm this is intended.
write.csv(mydt, "mydt.csv")