library(tidyverse)
library(dplyr)
#自定义函数:各种转换的方法
countToTpm <- function(counts, effLen)
{
rate <- log(counts) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
countToFpkm <- function(counts, effLen)
{
N <- sum(counts)
exp( log(counts) + log(1e9) - log(effLen) - log(N) )
}
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
countToCPM <- function( counts)
{
N <- sum(counts)
exp( log(counts) + log(1e6) - log(N) )
}
#读gtf文件,计算所有外显子的长度
gtf <- read_tsv("/home/data/t040334/rnaseq/ref/Mus_musculus.GRCm38.97.gtf",
comment="#",
col_names=c('chr','source','type','start','end',
'score','strand','phase','attributes')) %>%
filter(type=='exon') %>%
mutate(len = end - start + 1) %>%
select(start, end, attributes,len)
#计算基因的非冗余外显子的长度,即获得有效基因长度
gtf$attributes %>% str_extract(., "gene_id \"[\\w|\\.]+") %>%
str_remove(., "gene_id \"") -> gtf$gene_id
gtf %>% select(start, end, gene_id, len) %>%
distinct(start,end,gene_id, .keep_all = T) %>%
select(gene_id,len) %>% group_by(gene_id) %>%
summarise(est_len=sum(len)) -> gtf
#读取count的表达量矩阵,其中gene_id是和gtf文件一致的,然后和刚才计算得到的有效基因长度合并
expmat <- read_excel("gene_count_matrix.xlsx") %>%
inner_join(gtf , by = 'gene_id' ) %>%
drop_na()
#计算fpkm
fpkm=apply(expmat[,2:7],MARGIN = 2,function(x) countToFpkm(x,expmat$est_len ))
########第二种方法###########
library(tidyverse)
library(dplyr)
#读gtf文件,计算所有外显子的长度
gtf <- read_tsv("/home/data/t040334/rnaseq/ref/Mus_musculus.GRCm38.97.gtf",
comment="#",
col_names=c('chr','source','type','start','end',
'score','strand','phase','attributes')) %>%
filter(type=='exon') %>%
mutate(len = end - start + 1) %>%
select(start, end, attributes,len)
#计算基因的非冗余外显子的长度,即获得有效基因长度
gtf$attributes %>% str_extract(., "gene_id \"[\\w|\\.]+") %>%
str_remove(., "gene_id \"") -> gtf$gene_id
gtf %>% select(start, end, gene_id, len) %>%
distinct(start,end,gene_id, .keep_all = T) %>%
select(gene_id,len) %>% group_by(gene_id) %>%
summarise(est_len=sum(len)) -> gtf
gene_count_matrix <- read_excel("gene_count_matrix.xlsx")
merge<-left_join(gene_count_matrix,gtf,by="gene_id")#根据基因那列进行合并
merge <- na.omit(merge)#删除错误值行
merge=as.data.frame(merge)
row.names(merge)=merge$gene_id
merge=merge[,-1]
#TPM计算
kb <- merge$est_len / 1000
kb
countdata <- merge[,1:6]
rpk <- countdata / kb
rpk
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
fpkm <- t(t(rpk)/colSums(countdata) * 10^6)
head(fpkm)
##两种方法计算结果一致
##yangxinyu 2023.2.20