首页 文章

在R中按行名称组合不同长度的数据帧列表

提问于
浏览
1

我有一个不同长度的数据框列表(df),按年份索引,以便数据的代理如下所示:

df

  $df1          
         X..i..
  1999     10
  1998     13 
  1997     14

 $df2
         X..i..
 1999      20
 1998      11

 $df3
         X..i..
 1999      17
 1998       8
 1997       9
 1996      19

我想使用并保留索引/ rownames将这些数据帧组合到单个数据帧中

以便:

df_all
 Index     df1   df2   df3
  1999      10    20    17
  1998      13    11     8
  1997      14    n/a    9
  1996      n/a   n/a   19

编辑:

smalldflist <- lapply(bai_df, function(i) head(i, 10))
dput(smalldflist)

产生以下输出:

结构(列表(IN_DonaldsonWoods_QUAL.txt =结构(列表(X..i .. = c(4.5528243479162,32.6474339976978,52.7116018957456,170.932582874866,227.0430440174,191.462399206825,226.94053541991,274.854835798233,536.457600434571,409.132933511232)) . . Name =“X ..我...“,row.names = c(”1725“,”1726“,”1727“,”1728“,”1729“,”1730“,”1731“,”1732“,”1733“,”1734“ ),class =“data.frame”),IN_DonaldsonWoods_QURU.txt = structure(list(X..i .. = c(4.33729067152776,5.72878688080428,13.0247658962315,22.0205798005054,25.9885943197615,18.9273551074104,43.5197887382031,58.2775710248884,72.9225976242458,73.0466756114972)), .Names =“X..i ..”,row.names = c(“1827”,“1828”,“1829”,“1830”,“1831”,“1832”,“1833”,“1834”, “1835”,“1836”),class =“data.frame”),IN_DonaldsonWoods_QUVE.txt = structure(list(X..i .. = c)(7.87253273859391,18.9481296742303,42.5055176960097,62.9980951594496,88.906442207264,74.2523230533691,106.911242713809,152.445167763284 ,192.399603839633,221.263660216113)),. Na mes =“X..i ..”,row.names = c(“1731”,“1732”,“1733”,“1734”,“1735”,“1736”,“1737”,“1738”,“ 1739“,”1740“),class =”data.frame“),IN_LillyDickey_QUAL.txt = structure(list(X..i .. = c)(8.29576810088555,17.2934968058816,31.2091720401804,33.8966066349882,47.6496887415004,32.9921546763907,82.2281435044324,108.068226885475, 103.894002151431,110.255812097949)), . Name =“X..i ..”,row.names = c(“1863”,“1864”,“1865”,“1866”,“1867”,“1868”,“1869 “,”1870“,”1871“,”1872“),类=”data.frame“),IN_LillyDickey_QUMO.txt =结构(列表(X..i .. = c)(3.42413493048312,8.0847630303073,19.6833503197648,13.791136218324,21.4638165402601 ,30.6707376168741,30.8789937938806,26.8661212585221,24.0732956549621,29.7872997715364)), . Name =“X..i ..”,row.names = c(“1867”,“1868”,“1869”,“1870”,“1871” ,“1872”,“1873”,“1874”,“1875”,“1876”),class =“data.frame”),IN_Pioneers_QUAL.txt =结构(列表(X..i .. = c(9.14340435634345, 23.5108626053757,33.8507393822465,46.1027716604662,57 .5247983011993,50.5892015892391,92.2448163706925,225.832932372368,278.367628044195,193.931508820174)),. Name =“X..i ..”,row.names = c(“1817”,“1818”,“1819”,“1820”,“ 1821“,”1822“,”1823“,”1824“,”1825“,”1826“),class =”data.frame“),IN_Pioneers_QURU.txt = structure(list(X..i .. = c( 122.443727611702,658.649900930018,830.471777578934,843.357139228152,1725.6495913006,1244.38668477703,973.00892131628,1294.7441782001,1717.18570086886,1676.63841798444)), . Name =“X..i ..”,row.names = c(“1861”,“1862”,“1863 “,”1864“,”1865“,”1866“,”1867“,”1868“,”1869“,”1870“),class =”data.frame“),OH_JohnsonWoods_QUAL.txt = structure(list(X . .i .. = c(1.9113449704439,3.39794661412248,5.32688450342693,6.41921626908008,11.0307601252838,13.0825342873437,14.843680070585,16.885746353779,20.1011664347289,19.853294774361)), . Name =“X..i ..”,row.names = c(“1626” ,“1627”,“1628”,“1629”,“1630”,“1631”,“1632”,“1633”,“1634”,“1635”),class =“data.frame”)),. Names = c(“IN_Dona ldsonWoods_QUAL.txt“,”IN_DonaldsonWoods_QURU.txt“,”IN_DonaldsonWoods_QUVE.txt“,”IN_LillyDickey_QUAL.txt“,”IN_LillyDickey_QUMO.txt“,”IN_Pioneers_QUAL.txt“,”IN_Pioneers_QURU.txt“,”OH_JohnsonWoods_QUAL.txt“))

3 回答

  • 1

    如果您拥有仅在全局环境中需要的数据,则可以尝试以下操作 . 首先,您在所有数据框中收集独特年份并创建主数据框,其中仅包含唯一年份 . 然后,将所有数据框放在一个列表中,并将每个数据框与 master 合并 . 由于 temp 中有主数据框,因此将其删除 . 最后,绑定所有数据帧并将长格式更改为宽格式 .

    library(tidyverse)
    
    # Create a data frame with all unique years
    
    master <- data.frame(year = mget(ls()) %>%
                                sapply(`[`, 1) %>%
                                as_vector %>% 
                                unique)
    
    # Merge each data frame with the master df
    temp <- mget(ls()) %>%
            lapply(function(x){full_join(x, master, by = "year")})
    
    # Remove the master df in the list
    temp[["master"]] <- NULL
    
    # Bind all dfs and make it wide.
    bind_rows(temp, .id = "data") %>%
    spread(key = data, value = value)
    
    #  year df1 df2 df3
    #1 1996  NA  NA  19
    #2 1997  14  NA   9
    #3 1998  13  11   8
    #4 1999  10  20  17
    

    数据

    df1 <- data.frame(year = c(1999, 1998, 1997),
                      value = c(10, 13, 14))
    
    df2 <- data.frame(year = c(1999, 1998),
                      value = c(20, 11))
    
    df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
                      value = c(17, 8, 9, 19))
    
  • 1

    重新考虑@Djork显示的链合并,但要确保创建一个名为Index的实际列,索引等于 rownames() . 此外,根据df#重命名 X..1 列,这也避免了合并期间的重复列警告 . dfs下面相当于发布的smalldflist:

    dfs <- lapply(seq_along(dfs), function(i){
            dfs[[i]]$Index = rownames(dfs[[i]])        # CREATE INDEX
            colnames(dfs[[i]])[1] <- paste0("df", i)   # RENAME X..1 COLUMN
    
            return(dfs[[i]])
    })
    
    dfs[[1]]
    #             df1 Index
    # 1725   4.552824  1725
    # 1726  32.647434  1726
    # 1727  52.711602  1727
    # 1728 170.932583  1728
    # 1729 227.043044  1729
    # 1730 191.462399  1730
    # 1731 226.940535  1731
    # 1732 274.854836  1732
    # 1733 336.457600  1733
    # 1734 409.132934  1734
    
    finaldf <- Reduce(function(...) merge(..., by="Index", all=TRUE), dfs)
    
    finaldf
    #    Index    df1     df2     df3     df4    df5    df6   df7        df8
    # 1   1626     NA      NA      NA      NA     NA     NA    NA   1.911345
    # 2   1627     NA      NA      NA      NA     NA     NA    NA   3.397947
    # 3   1628     NA      NA      NA      NA     NA     NA    NA   5.326885
    # 4   1629     NA      NA      NA      NA     NA     NA    NA   6.419216
    # 5   1630     NA      NA      NA      NA     NA     NA    NA  11.030760
    # ...
    
  • 1

    您可以使用 Reducemerge 多个数据帧 . 设置 all = TRUE ,在没有匹配时添加NAs . 注意df是您设置的数据框列表, by 表示用于合并的列 . 因此,在数据框列表中,"Index"应该是每个数据框中年份列的名称 .

    Reduce(function(...) merge(..., by="Index", all=TRUE), df)
    

    并且由于@jazzuro提供了样本数据,这里是使用基础R中的 Reduce 的等效解决方案 . 在此示例中设置用于合并 by="year" 的列:

    df1 <- data.frame(year = c(1999, 1998, 1997),
                  value = c(10, 13, 14))
    
    df2 <- data.frame(year = c(1999, 1998),
                  value = c(20, 11))
    
    df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
                  value = c(17, 8, 9, 19))
    df <- list(df1=df1, df2=df2, df3=df3)
    df_merge  <- Reduce(function(...) merge(..., by="year", all=TRUE), df)
    colnames(df_merge) <- c("Index", names(df))
    
    #   Index df1 df2 df3
    # 1 1996  NA  NA  19
    # 2 1997  14  NA   9
    # 3 1998  13  11   8
    # 4 1999  10  20  17
    

相关问题