I am trying to two data frames (df_a and df_b) in R (essentially I want to repopulate df_a with the updated data contained within df_b). The columns in df_b are all present in df_a. Within df_b there is (important) redundancy in ref_transcript_name, ref_transcript_id, and ref_gene_name, but all values of qry_transcript_id are unique and have a one-to-one relationship with df_a. My assumption here is that a left_join() would do the trick. I've tried:
- df_c <- left_join(df_a, df_b)- here- df_cis identical to- df_b
- df_c <- left_join(df_a, df_b, by = "qry_transcript_id")- here- df_ccontains the three non-guide columns of- df_bas new columns of- df_c.
I'm clearly missing something fundamental about the join functions here, but essentially I want to populate (most of) the missing values in df_a with the values from df_b.
Here are my data:
dput(df_a)
structure(list(ref_gene_id = c("LOC108906895", NA, NA, "LOC108906894", 
"LOC108906894", "LOC108906894", "LOC108906889", "LOC108906897", 
"LOC108906897", NA, "LOC108906891", "LOC108906890", "LOC108906896", 
NA, "LOC108906893", NA, "LOC108906892", "LOC108905349", "LOC108905349", 
"LOC108905349", "LOC108905394", "LOC108905394", "LOC108905439", 
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905439", 
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905439", 
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905350", 
"LOC108905395", "LOC108905377", "LOC108905377", "LOC108905399", 
"LOC108905399", "LOC108905452", "LOC108905450", "LOC108905450", 
"LOC108905450", "LOC108905425", "LOC108905427", "LOC108905429", 
"LOC108905426", "LOC108905352", "LOC108905375", "LOC108905391", 
NA, NA, NA), qry_gene_id = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 
4L, 5L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 14L, 14L, 
15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 
16L, 17L, 18L, 19L, 19L, 20L, 20L, 21L, 23L, 23L, 23L, 22L, 24L, 
27L, 25L, 26L, 28L, 29L, 30L, 31L, 32L), .Label = c("G229", "G230", 
"G232", "G233", "G234", "G235", "G236", "G237", "G238", "G239", 
"G240", "G241", "G242", "G243", "G244", "G245", "G246", "G247", 
"G248", "G249", "G250", "G251", "G252", "G253", "G254", "G255", 
"G256", "G257", "G258", "G259", "G260", "G261"), class = "factor"), 
    ref_gene_name = c("uncharacterized LOC108906895", NA, NA, 
    "uncharacterized LOC108906894", "uncharacterized LOC108906894", 
    "uncharacterized LOC108906894", "myosin regulatory light chain sqh", 
    "sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1", 
    "sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1", 
    NA, "uncharacterized LOC108906891", "protein twisted gastrulation", 
    "paraplegin", NA, "fork head domain-containing protein crocodile", 
    NA, "forkhead box protein F1-like", "centrosomal protein of 135 kDa-like", 
    "centrosomal protein of 135 kDa-like", "centrosomal protein of 135 kDa-like", 
    "nuclear transcription factor Y subunit alpha", "nuclear transcription factor Y subunit alpha", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2", 
    "myb-like protein X", "uncharacterized LOC108905395", "uncharacterized LOC108905377", 
    "uncharacterized LOC108905377", "uncharacterized LOC108905399", 
    "uncharacterized LOC108905399", "uncharacterized LOC108905452", 
    "uncharacterized LOC108905450", "uncharacterized LOC108905450", 
    "uncharacterized LOC108905450", "uncharacterized LOC108905425", 
    "N-alpha-acetyltransferase 38, NatC auxiliary subunit", "cytochrome c oxidase assembly factor 6 homolog", 
    "N-alpha-acetyltransferase 30A", "ESF1 homolog", "atypical kinase COQ8B, mitochondrial", 
    "calphotin-like", NA, NA, NA), gene_annotation = c("refseq", 
    "novel", "novel", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "novel", "refseq", "refseq", "refseq", 
    "novel", "refseq", "novel", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "refseq", "refseq", "refseq", 
    "refseq", "refseq", "novel", "novel", "novel"), ref_transcript_id = c("XR_001964310.2", 
    NA, NA, "XR_001964308.1", "XR_001964308.1", "XR_001964308.1", 
    "XM_018710327.1", "XM_018710334.2", "XM_018710334.2", NA, 
    "XM_018710330.2", "XM_018710328.1", "XM_018710333.1", NA, 
    "XM_018710332.1", NA, "XM_018710331.1", "XM_018708179.2", 
    "XM_018708179.2", "XM_018708179.2", "XM_018708228.2", "XM_018708229.2", 
    "XM_018708292.1", "XM_018708292.1", "XM_023457437.1", "XM_018708292.1", 
    "XM_018708292.1", "XM_018708292.1", "XM_018708292.1", "XM_018708292.1", 
    "XM_018708299.1", "XM_018708292.1", "XM_018708292.1", "XM_018708292.1", 
    "XM_018708180.1", "XM_018708231.1", "XM_018708208.2", "XM_023453940.1", 
    "XM_018708235.2", "XM_018708235.2", "XM_018708321.1", "XM_018708319.1", 
    "XM_018708318.1", "XM_018708318.1", "XM_018708263.1", "XM_018708266.1", 
    "XM_018708267.1", "XM_018708265.1", "XM_018708181.2", "XM_018708205.1", 
    "XM_018708226.1", NA, NA, NA), qry_transcript_id = structure(c(1L, 
    2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
    28L, 29L, 30L, 32L, 27L, 31L, 33L, 34L, 35L, 36L, 37L, 38L, 
    39L, 40L, 41L, 43L, 44L, 45L, 42L, 46L, 49L, 47L, 48L, 50L, 
    51L, 52L, 53L, 54L), .Label = c("TU429", "TU430", "TU431", 
    "TU435", "TU436", "TU437", "TU438", "TU439", "TU440", "TU441", 
    "TU442", "TU443", "TU444", "TU445", "TU446", "TU447", "TU448", 
    "TU449", "TU450", "TU451", "TU452", "TU453", "TU454", "TU455", 
    "TU456", "TU457", "TU458", "TU459", "TU460", "TU461", "TU462", 
    "TU463", "TU464", "TU465", "TU466", "TU467", "TU468", "TU469", 
    "TU470", "TU471", "TU472", "TU473", "TU474", "TU475", "TU476", 
    "TU477", "TU478", "TU479", "TU480", "TU481", "TU482", "TU483", 
    "TU484", "TU485"), class = "factor"), ref_transcript_name = structure(c(30L, 
    NA, NA, 29L, 29L, 29L, 12L, 19L, 19L, NA, 28L, 18L, 17L, 
    NA, 6L, NA, 7L, 3L, 3L, 3L, 15L, 16L, 8L, 8L, 9L, 8L, 8L, 
    8L, 8L, 8L, 10L, 8L, 8L, 8L, 11L, 22L, 20L, 21L, 23L, 23L, 
    27L, 26L, 25L, 25L, 24L, 14L, 4L, 13L, 5L, 1L, 2L, NA, NA, 
    NA), .Label = c("atypical kinase COQ8B, mitochondrial", "calphotin-like", 
    "centrosomal protein of 135 kDa-like", "cytochrome c oxidase assembly factor 6 homolog, transcript variant X1", 
    "ESF1 homolog", "fork head domain-containing protein crocodile", 
    "forkhead box protein F1-like", "homeodomain-interacting protein kinase 2, transcript variant X1", 
    "homeodomain-interacting protein kinase 2, transcript variant X11", 
    "homeodomain-interacting protein kinase 2, transcript variant X8", 
    "myb-like protein X", "myosin regulatory light chain sqh", 
    "N-alpha-acetyltransferase 30A", "N-alpha-acetyltransferase 38, NatC auxiliary subunit", 
    "nuclear transcription factor Y subunit alpha, transcript variant X1", 
    "nuclear transcription factor Y subunit alpha, transcript variant X2", 
    "paraplegin", "protein twisted gastrulation", "sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1", 
    "uncharacterized LOC108905377, transcript variant X1", "uncharacterized LOC108905377, transcript variant X2", 
    "uncharacterized LOC108905395", "uncharacterized LOC108905399", 
    "uncharacterized LOC108905425, transcript variant X1", "uncharacterized LOC108905450, transcript variant X1", 
    "uncharacterized LOC108905450, transcript variant X2", "uncharacterized LOC108905452, transcript variant X2", 
    "uncharacterized LOC108906891", "uncharacterized LOC108906894", 
    "uncharacterized LOC108906895, transcript variant X2"), class = "factor"), 
    transcript_annotation = c("refseq", "novel", "novel", "refseq", 
    "refseq", "novel", "refseq", "refseq", "novel", "novel", 
    "novel", "novel", "refseq", "novel", "refseq", "novel", "refseq", 
    "refseq", "novel", "novel", "refseq", "refseq", "novel", 
    "novel", "refseq", "novel", "novel", "novel", "novel", "novel", 
    "refseq", "novel", "novel", "novel", "refseq", "refseq", 
    "refseq", "refseq", "refseq", "novel", "refseq", "refseq", 
    "refseq", "novel", "refseq", "refseq", "refseq", "novel", 
    "refseq", "refseq", "refseq", "novel", "novel", "novel"), 
    class_code = structure(c(1L, 5L, 5L, 1L, 4L, 3L, 1L, 4L, 
    3L, 5L, 3L, 3L, 1L, 5L, 1L, 5L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 
    3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 
    1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 5L, 5L, 
    5L), .Label = c("=", "c", "j", "k", "u"), class = "factor")), row.names = 432:485, class = "data.frame")
dput(df_b)
structure(list(qry_transcript_id = structure(1:10, .Label = c("TU118", 
"TU151", "TU255", "TU417", "TU430", "TU431", "TU485", "TU543", 
"TU687", "TU807"), class = "factor"), ref_transcript_name = structure(c(8L, 
1L, 2L, 7L, 4L, 4L, NA, 5L, 6L, 3L), .Label = c("apoptosis-stimulating of p53 protein 1 isoform X3", 
"basic proline-rich protein-like", "microtubule-associated protein 10-like", 
"protein dopey homolog PFC0245c-like", "protein sprint isoform X2", 
"serine/arginine repetitive matrix protein 2-like", "tigger transposable element-derived protein 1-like", 
"uncharacterized protein LOC108904829"), class = "factor"), ref_transcript_id = c("XP_018563024", 
"XP_023014054", "XP_019880584", "XP_018578361", "XP_024947529", 
"XP_024947524", NA, "XP_030753146", "XP_018575004", "XP_023028347"
), ref_gene_name = c("* uncharacterized protein LOC108904829", 
"* apoptosis-stimulating of p53 protein 1 ", "* basic proline-rich protein-like", 
"* tigger transposable element-derived protein 1-like", "* protein dopey homolog PFC0245c-like", 
"* protein dopey homolog PFC0245c-like", NA, "* protein sprint ", 
"* serine/arginine repetitive matrix protein 2-like", "* microtubule-associated protein 10-like"
)), row.names = c(NA, 10L), class = "data.frame")
Hopefully my subsetting doesn't create problems here, but in the full data set, all qry_transcript_ids in df_b are contained with df_a.
 
     
    