0
点赞
收藏
分享

微信扫一扫

Adam学习14之Fasta在Adam中的初始存储格式NucleotideContigFragment


更多Adam学习代码等资料请见:https://github.com/xubo245/AdamLearning


1.Fasta格式在Adam里面对应的Avro模式是NucleotideContigFragment,位于org.bdgenomics.formats.avro包下,该包属于bdg-formats项目,更多内容可以参考:https://github.com/bigdatagenomics/bdg-formats


2.参考2中有具体的fasta的操作,截取结果为:


{"contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": "AC:KN707606.1  gi:734691250  LN:2200  rl:unplaced  M5:20c768ac79ca38077e5012ee0e5f8333  AS:hs38d1", "fragmentSequence": "ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtgttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgagccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTACCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGCACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGCCCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGGGGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggttatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaacaattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccaggagttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaaaaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAAGGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttcttctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGTGCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTGCTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAACCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCTACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAAACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaatagtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagattacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtctcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacCAACTTCTC
TTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGGCTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcctttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCTTCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTATCCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtctcgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctcccgcctcagcctcccgagtagctgggactacaggcactcg", "fragmentNumber": 0, "fragmentStartPosition": 0, "fragmentLength": 2200, "numberOfFragmentsInContig": 1}



3.创建一个空的NucleotideContigFragment对象:

代码:


/**
* @author xubo
* Fasta/Fastq/SAM/BAM read
*/
package org.bdgenomics.adamLocal.algorithms.test

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext
import org.bdgenomics.formats.avro.NucleotideContigFragment
import org.bdgenomics.formats.avro.Contig
//import scala.collection.parallel.Foreach

/**
 * Minimal example that builds an empty NucleotideContigFragment Avro record
 * (with an empty Contig attached) and prints it.
 */
object NucleotideContigFragmentTest {

  /**
   * Starts a local SparkContext, builds a NucleotideContigFragment whose only
   * populated field is an empty Contig, prints the record and stops the
   * context.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FastaAndNucleotideContigFragment").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // No Contig fields are set, so the built Contig carries only defaults.
      val contig = Contig.newBuilder().build()
      val fragment = NucleotideContigFragment.newBuilder()
        .setContig(contig)
        .build()
      println(fragment)
    } finally {
      // Stop the context even if building or printing the record fails.
      sc.stop()
    }
  }
}


结果:


{"contig": {"contigName": null, "contigLength": null, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": null, "fragmentSequence": null, "fragmentNumber": null, "fragmentStartPosition": null, "fragmentLength": null, "numberOfFragmentsInContig": null}




附加:通过分析ADAMContext中的loadFasta方法可以看到,它调用了org.bdgenomics.adam.converters包下的FastaConverter,其中有一段代码创建了该格式的对象,可以作为参考:


/**
 * Loads a FASTA file as an RDD of NucleotideContigFragment records.
 *
 * The file is read through the Hadoop text input format, every record is
 * remapped to a (Long, String) pair, and the pairs are handed to
 * FastaConverter together with the requested fragment length.
 *
 * @param filePath path of the FASTA file to read
 * @param fragmentLength value passed through to FastaConverter
 *                       (presumably the maximum fragment size; confirm against FastaConverter)
 * @return the RDD produced by FastaConverter
 */
def loadFasta(
  filePath: String,
  fragmentLength: Long): RDD[NucleotideContigFragment] = {
  val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile(
    filePath,
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text]
  )
  // NOTE(review): the value of this expression is discarded. If instrument()
  // returns a new RDD instead of modifying fastaData in place, the map below
  // runs on the uninstrumented RDD. Confirm against the instrumentation API.
  if (Metrics.isRecording) fastaData.instrument() else fastaData

  // Unwrap the Hadoop writables into plain Scala values.
  val remapData = fastaData.map(kv => (kv._1.get, kv._2.toString))

  FastaConverter(remapData, fragmentLength)
}




// Pair every fragment's bases with its index and build one
// NucleotideContigFragment record per pair.
val fragments = sequencesAsFragments.zipWithIndex
  .map {
    case (bases, index) =>
      // Builder for the contig this fragment belongs to.
      val contigBuilder = Contig.newBuilder
        .setContigLength(sequenceLength)

      // Builder for the fragment record itself.
      val fragmentBuilder = NucleotideContigFragment.newBuilder()
        .setFragmentSequence(bases)
        .setFragmentNumber(index)
        .setFragmentStartPosition(index * fragmentLength)
        .setNumberOfFragmentsInContig(fragmentCount)
        .setFragmentLength(bases.length)

      // Name and description are only set when they are present.
      name.foreach(n => contigBuilder.setContigName(n))
      description.foreach(d => fragmentBuilder.setDescription(d))

      // Attach the finished contig, then build and return the fragment.
      fragmentBuilder.setContig(contigBuilder.build).build()
  }




参考:

【1】https://github.com/xubo245/AdamLearning


【3】https://github.com/bigdatagenomics/adam


举报

相关推荐

0 条评论