"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/src/load2db.cc" (22 May 2019, 17548 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "load2db.cc" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 /*
    2  * load2db.cc
    3  *
    4  * License: Artistic License, see file LICENSE.TXT or 
    5  *          https://opensource.org/licenses/artistic-license-1.0
    6  * 
    7  * Description: Load the sequences from a flat file in fasta format into a database.
    8  */
    9 
   10 // Project includes
   11 #include "fasta.hh"
   12 #include "projectio.hh"
   13 #include "hints.hh"
   14 
   15 // standard C/C++ includes
   16 #include <string>
   17 #include <iostream>
   18 #include <fstream>
   19 #include <getopt.h>     /* for getopt_long; standard getopt is in unistd.h */
   20 #include <stdlib.h>     /* for exit() */
   21 #include <mysql++.h>
   22 #include <exception>
   23 #include <ssqls.h>
   24 
   25 //#include <boost/iostreams/filtering_stream.hpp>
   26 //#include <boost/iostreams/filter/gzip.hpp>
   27 //#include <boost/iostreams/copy.hpp>
   28 
   29 using namespace std;
   30 //using boost::iostreams::filtering_istream;
   31 //using boost::iostreams::gzip_decompressor;
   32 
   33 //sql_create_6(genomes,1,6,int,seqid,string,sequence,string,seqname,int,start,int,end,string,species)
   34 
   35 int chunksize = 50000;
   36 mysqlpp::Connection con;
   37 
   38 void printUsage();
   39 void connectDB(string dbaccess);
   40 void createTableGenomes();
   41 void createTableSpeciesnames();
   42 void createTableSeqnames();
   43 void createTableHints();
   44 void createTableFeatureTypes();
   45 int getSpeciesID(string species);
   46 int getSeqNr(char* seqname,int speciesid);
   47 int insertSeqName(char* seqname,int speciesid);
   48 int insertSeq(string sequence, char* name, int length, int speciesid);
   49 void insertHint(Feature &f, string species);
   50 void checkConsistency(); // checks consistency of the 'hints' data with the 'genomes' data
   51 void removeDuplicates();
   52 
   53 /*
   54  * main
   55  */
   56 int main( int argc, char* argv[] ){
   57     int c;
   58     int help = 0;
   59     string species;
   60     string dbaccess;
   61     string filename;
   62     static struct option long_options[] = {
   63         {"species", 1, 0, 's'},
   64         {"dbaccess", 1, 0, 'd'},
   65         {"help", 0, 0, 'h'},
   66     {"chunksize", 1, 0, 'c'},
   67         {NULL, 0, NULL, 0}
   68     };
   69     int option_index = 0;
   70     while ((c = getopt_long(argc, argv, "s:d:hc:", long_options, &option_index)) != -1) {
   71         switch (c) {
   72         case 's':
   73         species = optarg;
   74             break;
   75         case 'd':
   76         dbaccess = optarg;
   77             break;
   78         case 'h':
   79             help = 1;
   80             break;
   81         case 'c':
   82         chunksize = atoi(optarg);
   83             break;
   84         default:
   85             break;
   86         }
   87     }
   88     if (optind < argc-1) {
   89         cerr << "More than one option without name: ";
   90         while (optind < argc)
   91             cerr << " " << argv[optind++];
   92     cerr << endl << "Only the sequence/hints file name does not require a parameter name." << endl;
   93     if (!help)
   94         printUsage();
   95     exit(1);
   96     }
   97     if (optind == argc-1)
   98     filename = argv[optind];
   99     else {
  100     cerr << "Missing sequence/hints file name." << endl;
  101     printUsage();
  102     exit(1);
  103     }
  104     if (help) {
  105     printUsage();
  106     exit(1);
  107     }
  108     if (species.empty()){
  109     cerr << "Missing species name. Required parameter." << endl;
  110     printUsage();
  111     exit(1);
  112     }
  113     if (dbaccess.empty()){
  114     cerr << "Missing database access info. dbaccess is a required parameter." << endl;
  115     printUsage();
  116     exit(1);
  117     }
  118     if (chunksize < 2){
  119     cerr << "Chunksize too small (" << chunksize << "). Should be roughly in the oder of a gene's length." << endl;
  120     printUsage();
  121     exit(1);
  122     }
  123     if (chunksize > 1000000){
  124     cerr << "Chunksize too big (" << chunksize << "). " << endl;
  125     printUsage();
  126     exit(1);
  127     }
  128 
  129     try {
  130     connectDB(dbaccess);
  131     } catch (const char *m) {
  132     cerr << "Database connection error:" << endl << m << endl;
  133     exit(1);
  134     }
  135     try {
  136     createTableSpeciesnames();
  137     createTableSeqnames();
  138 
  139     ifstream ifstrm;
  140     ifstrm.open(filename.c_str());
  141     if( !ifstrm )
  142         throw string("Could not open input file \"") + filename + "\"!";
  143 
  144     /*filtering_istream zin;
  145     try {  
  146         zin.push(gzip_decompressor());
  147         zin.push(ifstrm);
  148         zin.peek();
  149         if (!zin)
  150         throw("Could not read first character assuming gzip format.");
  151         cout << "Looks like " << filename << " is a gzip file. Deflating..." << endl;
  152         } catch (...) { // boost::iostreams::gzip_error& 
  153         // not a gzip file or ill-formatted
  154         zin.reset();
  155         ifstrm.seekg(0);
  156         zin.push(ifstrm);
  157         }*/
  158     
  159     //if(isFasta(zin)){
  160     if(isFasta(ifstrm)){
  161         cout << "Looks like " << filename << " is in fasta format." << endl;
  162         createTableGenomes();
  163         char *sequence = NULL, *name = NULL;
  164         int length = 0, seqCount = 0, chunkCount = 0;
  165         unsigned int lenCount = 0;
  166         //readOneFastaSeq(zin, sequence, name, length);
  167         readOneFastaSeq(ifstrm, sequence, name, length);
  168         int speciesid = getSpeciesID(species);
  169         while (sequence){
  170         chunkCount += insertSeq(sequence, name, length, speciesid);
  171         seqCount++;
  172         lenCount += length;
  173         delete sequence;
  174         delete name;
  175         sequence = name = NULL;
  176         //readOneFastaSeq(zin, sequence, name, length);
  177         readOneFastaSeq(ifstrm, sequence, name, length);
  178         }
  179         if (seqCount > 0)
  180         cout << "Inserted " << chunkCount << " chunks of " << seqCount << " sequences (total length "
  181              << lenCount << " bp)." << endl;
  182         else
  183         cout << "No sequences found. Nothing inserted into database." << endl;
  184         
  185     }
  186     else if(isGFF(ifstrm)){
  187         cout << "Looks like " << filename << " is in gff format." << endl;
  188         createTableHints();
  189         createTableFeatureTypes();
  190         map<string,int> hintCount; // stores for each sequence the number of hints inserted into the database
  191         while(ifstrm){
  192         Feature f;
  193         try{
  194             ifstrm >> f >> comment >> ws;
  195             if(f.type != -1){
  196             insertHint(f,species);
  197             hintCount[f.seqname]++;
  198             }
  199         } catch (ProjectError e){}
  200         }
  201         if(!hintCount.empty()){
  202         cout << "inserted" << endl; 
  203         for(map<string,int>::iterator it= hintCount.begin(); it != hintCount.end(); it++){
  204             cout << it->second << " hints for " << it->first << endl;
  205         }
  206         //removeDuplicates(); dublicastes can only be removed when multiplicity is updated
  207         }
  208         else{
  209         cout << "No hints found. Nothing inserted into database." << endl;
  210         }
  211         checkConsistency();
  212     }
  213     else{
  214         cout << filename << " is neither in gff nor in fasta format." << endl;
  215     }
  216     ifstrm.close();   
  217     } catch( string err ){
  218         cerr << "\n" <<  argv[0] << ": ERROR\n\t" << err << "\n\n";
  219     exit(1);
  220     } 
  221 }
  222 
  223 void printUsage(){
  224     cout << "usage:\n\
  225 load2db [parameters] --species=SPECIES --dbaccess=dbname,host,user,passwd  inputfilename\n\
  226 \n\
  227 inputfilename refers to a genome file in FASTA format or a hints file in GFF format\n\
  228 SPECIES is the same identifier as is used in the treefile and alnfile parameters to augustus.\n\
  229 \n\
  230 dbname,host,user,passwd are the name of the SQL database, the host name or IP, the database user and password\n\
  231 When storing genomes/hints of multiple organisms call this program repeatedly for each one.\n\
  232 A single table with the structure\n\
  233 \
  234 is created.\n\
  235 \n\
  236 parameters:\n\
  237 --help        print this usage info\n\
  238 --chunksize   this option is only relevant when loading a sequence file\n\
  239               the sequences in the input genome are split into chunks of this size so\n\
  240               that subsequent retrievals of small sequence ranges do not require to read\n\
  241               the complete - potentially much longer - chromosome. (<= 1000000, default " << chunksize << ")\n\
  242 \n\
  243 example:\n\
  244      load2db --species=chicken --dbaccess=birds,localhost,mario,dF$n.E chickengenome.fa\n\
  245      load2db --species=chicken --dbaccess=birds,localhost,mario,dF$n.E chickenhints.gff\n\
  246 \n\
  247 Example code for database creation before calling load2db:\n\
  248 mysql -u root -p\n\
  249 create database birds;\n\
  250 select password('dF$n.E');\n\
  251 create user `mario`@`%` identified by password '*72E7F76393830492EB3C58AD730188708BD72DE1'; /* or whatever the password code is*/\n\
  252 grant all privileges on birds.* to mario@'%';\n";
  253 }
  254 
  255 void connectDB(string dbaccess){
  256     string db_name, host, user, passwd;
  257 
  258     string::size_type start=0, end;
  259     end = dbaccess.find(','); // string 'dbaccess' is delimited by ','
  260     if (end == string::npos)
  261     throw ("Database name missing in dbaccess.");
  262     db_name = dbaccess.substr(start, end-start);
  263     start = end + 1;
  264     end = dbaccess.find(',', start);
  265     if (end == string::npos)
  266     throw ("Host name missing in dbaccess.");
  267     host = dbaccess.substr(start, end-start);
  268     start = end + 1;
  269     end = dbaccess.find(',', start);
  270     if (end == string::npos)
  271     throw ("User name missing in dbaccess.");
  272     user = dbaccess.substr(start, end-start);
  273     start = end + 1;
  274     end = dbaccess.find(',', start);
  275     if (end == string::npos)
  276     passwd = dbaccess.substr(start);
  277     else 
  278     passwd = dbaccess.substr(start, end-start); // in case dbaccess also includes the port number
  279     
  280     try {
  281     cout << "Trying to connect to database " << db_name << " on server "
  282          << host << " as user " << user << " using password " << passwd << " ..." << endl;
  283     con.connect(db_name.c_str(), host.c_str(), user.c_str(), passwd.c_str());
  284     } catch(const mysqlpp::BadQuery& e){
  285     cout << "Connection error: " << e.what() << endl;
  286     }
  287 }
  288 
  289 /*
  290  * create table `genomes` if it does not already exist
  291  */
  292 void createTableGenomes(){
  293     mysqlpp::Query query = con.query("CREATE TABLE IF NOT EXISTS genomes (\
  294         seqid int(10) unsigned NOT NULL AUTO_INCREMENT,         \
  295         dnaseq longtext NOT NULL,                   \
  296         seqnr int(10) unsigned,                 \
  297         start int(9) unsigned NOT NULL,\
  298         end int(9) unsigned NOT NULL,\
  299         speciesid int(10) unsigned,\
  300         PRIMARY KEY (seqid),\
  301         KEY region (speciesid,seqnr,start,end),\
  302         FOREIGN KEY (speciesid,seqnr) REFERENCES seqnames(speciesid,seqnr)\
  303       ) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=10000000 AVG_ROW_LENGTH=50000;");
  304    query.execute();
  305 }
  306 /*
  307  * create table 'speciesnames' if it does not already exist
  308  */
  309 void createTableSpeciesnames(){
  310     mysqlpp::Query query = con.query("CREATE TABLE IF NOT EXISTS speciesnames (\
  311         speciesid int(10) unsigned NOT NULL AUTO_INCREMENT PRIMARY KEY,\
  312         speciesname varchar(50) UNIQUE KEY\
  313       ) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=1000000 AVG_ROW_LENGTH=50000;");
  314     query.execute();
  315 }
  316 /*                                                                                                                                                                                                                
  317  * create table 'seqnames' if it does not already exist                                                                                                                                                        
  318  */
  319 void createTableSeqnames(){
  320     mysqlpp::Query query = con.query("CREATE TABLE IF NOT EXISTS seqnames (\
  321         seqnr int(10) unsigned AUTO_INCREMENT,\
  322         speciesid int(10) unsigned REFERENCES speciesnames(speciesid),\
  323         seqname varchar(50),\
  324         PRIMARY KEY(speciesid,seqnr),\
  325         UNIQUE KEY(speciesid,seqname)\
  326       ) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=1000000 AVG_ROW_LENGTH=50000;");
  327     query.execute();
  328 }
  329 /*
  330  * create table 'hints' if it does not already exist
  331  */
  332 void createTableHints(){
  333     mysqlpp::Query query = con.query("CREATE TABLE IF NOT EXISTS hints (\
  334         hintid int(10) unsigned AUTO_INCREMENT PRIMARY KEY,\
  335         speciesid int(10) unsigned,\
  336         seqnr int(10) unsigned,\
  337         source varchar(50),\
  338         start int(9) unsigned NOT NULL,\
  339         end int(9) unsigned NOT NULL,\
  340         score float NOT NULL DEFAULT 0.0,\
  341         type tinyint unsigned NOT NULL,\
  342         strand enum('+','-','.') NOT NULL DEFAULT '.',\
  343         frame enum ('0','1','2','.') NOT NULL DEFAULT '.',\
  344         priority smallint NOT NULL DEFAULT -1,\
  345         grp varchar(100) DEFAULT '',\
  346         mult smallint unsigned DEFAULT 1,\
  347         esource varchar(10) NOT NULL,\
  348         KEY region (speciesid,seqnr,start,end),\
  349         FOREIGN KEY (speciesid,seqnr) REFERENCES seqnames(speciesid,seqnr)\
  350  ) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=1000000000 AVG_ROW_LENGTH=50000;");
  351     query.execute();
  352 }
  353 
  354 void createTableFeatureTypes(){
  355     mysqlpp::Query query = con.query("CREATE TABLE IF NOT EXISTS featuretypes (\
  356         typeid tinyint unsigned PRIMARY KEY,\
  357         typename enum('start','stop','ass','dss','tss','tts','exonpart','exon','intronpart','intron',\
  358         'irpart','CDS','CDSpart','UTR','UTRpart','nonexonpart','genicpart') NOT NULL\
  359  ) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=1000000 AVG_ROW_LENGTH=50000;");
  360     query.execute();
  361     for(int i=0; i < 17; i++){
  362     query << "INSERT IGNORE INTO featuretypes VALUES(" << i << "," << (i+1) << ")";
  363     //cout << "Executing" << endl << query.str() << endl;
  364     query.execute();
  365     }
  366 }
  367 
  368 /*
  369  * insert one sequence into database
  370  * return the number of chunks that sequence was cut into
  371  * start and end are 0-based and inclusive.
  372  */
  373 int insertSeq(string sequence, char *name, int length, int speciesid){
  374     int chunks = 0;
  375     int start = 0, end;
  376     int seqnr = insertSeqName(name,speciesid);
  377     while (start < length) {
  378     end = (start + chunksize < length)? (start + chunksize - 1) : length - 1;
  379     mysqlpp::Query query = con.query();
  380     query << "INSERT INTO genomes (dnaseq,seqnr,speciesid,start,end) VALUES(\""
  381           << sequence.substr(start, end-start+1) << "\"," << seqnr << "," 
  382           << speciesid << "," << start << "," << end << ")";
  383     //cout << "Executing" << endl << query.str() << endl;
  384     query.execute();
  385     chunks++;
  386     start += chunksize;
  387     }
  388     return chunks;
  389 }
  390 /*
  391  * insert one hint into database
  392  * start and end are 0-based and inclusive
  393  */
  394 void insertHint(Feature& f, string species){
  395     mysqlpp::Query query = con.query();
  396     int speciesid = getSpeciesID(species);
  397     int seqnr = getSeqNr(&(f.seqname[0]),speciesid);
  398     string attributes = "";
  399     string values = "";
  400     switch (f.strand) {
  401         case plusstrand: attributes+="strand,"; values+=("\"+\","); break;
  402         case minusstrand: attributes+="strand,"; values+=("\"-\","); break;
  403         default : break;
  404     }
  405     if (f.frame != -1){
  406     attributes+="frame,";
  407     values+=("\"" + itoa(f.frame) + "\",");
  408     }
  409     if (f.groupname != ""){
  410     attributes+="grp,";
  411     values+=("\"" + f.groupname + "\",");
  412     }
  413     if (f.mult > 1){
  414     attributes+="mult,";
  415     values+=(itoa(f.mult) + ",");
  416     }
  417     if (f.priority >= 0){
  418     attributes+="priority,";
  419     values+=(itoa(f.priority) + ",");
  420     }
  421     query << "INSERT INTO hints (speciesid,seqnr,source,start,end,score,type," << attributes << "esource) VALUES ("
  422       << speciesid << "," << seqnr <<  ",\"" << f.source << "\"," << f.start << "," << f.end << "," << f.score << ","
  423           << f.type << "," << values << "\""<< f.esource << "\")";
  424     //cout << "Executing" << endl << query.str() << endl;
  425     query.execute();
  426   
  427 }
  428 /*
  429  * returns the speciesid for a given species name
  430  * if the species name is not in the database it is inserted
  431  */
  432 int getSpeciesID(string species){
  433     mysqlpp::Query query = con.query();
  434     query << "SELECT speciesid FROM speciesnames WHERE speciesname='" <<species<<"'";;
  435     //cout << "Executing" << endl << query.str() << endl;
  436     mysqlpp::StoreQueryResult res = query.store();
  437     if(!res.empty()){
  438     return res[0]["speciesid"];
  439     }
  440     else{
  441     // insert species name
  442     query << "INSERT INTO speciesnames (speciesname) VALUES (\"" << species << "\")";
  443     //cout << "Executing" << endl << query.str() << endl;
  444     query.execute();
  445     return query.insert_id();
  446     }
  447 }
  448 /*
  449  * returns the seqnr for a given sequence name and species id
  450  * it the sequence name is not in the database, it will be inserted
  451  */
  452 int getSeqNr(char* seqname,int speciesid){
  453     mysqlpp::Query query = con.query();
  454     query << "SELECT seqnr FROM seqnames WHERE seqname='" << seqname << "' AND speciesid=" << speciesid;
  455     // cout << "Executing" << endl << query.str() << endl;
  456     mysqlpp::StoreQueryResult res = query.store();
  457     if (!res.empty()){
  458         return res[0]["seqnr"];
  459     } else{
  460         query << "INSERT INTO seqnames (speciesid,seqname) VALUES (" << speciesid << ",\"" << seqname << "\")";
  461     // cout << "Executing" << endl << query.str() << endl;
  462         query.execute();
  463         return query.insert_id();
  464     }
  465 }
  466 int insertSeqName(char* seqname,int speciesid){
  467     try{
  468     mysqlpp::Query query = con.query();
  469     query << "INSERT INTO seqnames (speciesid,seqname) VALUES (" << speciesid << ",\"" << seqname << "\")";
  470     //cout << "Executing" << endl << query.str() << endl;
  471     query.execute();
  472     return query.insert_id();
  473     } catch(const mysqlpp::BadQuery& e){
  474         cerr <<"sequence "<< seqname <<" already exists in database" << endl;
  475     cerr <<"Please delete sequence, before reloading"<<endl;
  476     exit(1);
  477     }
  478 }
  479 
  480 void checkConsistency(){
  481     mysqlpp::Query query = con.query("SELECT DISTINCT speciesname,seqname FROM hints as h,\
  482         speciesnames as s,seqnames as n where h.speciesid=s.speciesid AND\
  483         s.speciesid=n.speciesid AND h.seqnr=n.seqnr AND NOT EXISTS\
  484         (SELECT DISTINCT speciesid,seqnr FROM genomes as g WHERE g.speciesid=h.speciesid AND g.seqnr=h.seqnr);");
  485     mysqlpp::StoreQueryResult res = query.store();
  486     if(!res.empty()){
  487     cout << "WARNING: the following sequences are not in the database although we have hints for them" << endl;
  488     for(size_t i=0;i<res.num_rows();i++){
  489         cerr<<res[i]["seqname"]<<" (species "<<res[i]["speciesname"]<<")"<<endl;
  490     }
  491     }
  492 }
  493 void removeDuplicates(){
  494     cout << "removing duplicate hints"<< endl;
  495     mysqlpp::Query query = con.query(" ALTER IGNORE TABLE hints ADD UNIQUE\
  496         (speciesid,seqnr,source,start,end,score,type,strand,frame, priority,grp,mult,esource);");
  497     mysqlpp::SimpleResult res = query.execute();
  498     cout << res.info() << endl;
  499 }