00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifndef BTK_PDB_FILE_PARSER_HPP
00022 #define BTK_PDB_FILE_PARSER_HPP
00023
00024 #include <ios>
00025 #include <sstream>
00026 #include <fstream>
00027 #include <cctype>
00028 #include <string>
00029 #include <algorithm>
00030
00031 #include <btk/core/math/btk_vector.hpp>
00032 #include <btk/core/common/exceptions.hpp>
00033 #include <btk/core/io/logging.hpp>
00034 #include <btk/core/io/default_type_system.hpp>
00035
00036 namespace BTK {
00037 namespace EXCEPTIONS {
00038
00039
00040
00041
00042 NEW_BTK_EXCEPTION_TYPE(BTKFileOpenFailure,BTKRuntimeError);
00043 NEW_BTK_EXCEPTION_TYPE(BTKFileReadFailure,BTKRuntimeError);
00044 NEW_BTK_EXCEPTION_TYPE(BTKCorruptPDBFile,BTKRuntimeError);
00045
00046 }
00047
00048 namespace IO {
00049
00050 using BTK::EXCEPTIONS::BTKInvalidArgument;
00051 using BTK::EXCEPTIONS::BTKFileOpenFailure;
00052 using BTK::EXCEPTIONS::BTKFileReadFailure;
00053 using BTK::EXCEPTIONS::BTKCorruptPDBFile;
00054
00055 template <typename TypeSystem = DefaultTypeSystem>
00056 class PDBFileParser
00057 {
00058 public:
00059 typedef TypeSystem type_system;
00060
00061 typedef typename type_system::element_dictionary element_dictionary;
00062 typedef typename type_system::atom_dictionary atom_dictionary;
00063 typedef typename type_system::monomer_dictionary monomer_dictionary;
00064
00065 typedef typename element_dictionary::id_type element_id_type;
00066 typedef typename atom_dictionary::id_type atom_id_type;
00067 typedef typename monomer_dictionary::id_type monomer_id_type;
00068
00071 PDBFileParser(type_system const & ts = type_system(),
00072 bool use_dynamic_types = true) :
00073 _dictionary(ts), _dynamic_types(true) {}
00074
00084 PDBFileParser(std::string const & filename,
00085 int which_model = 0,
00086 type_system const & ts = type_system(),
00087 bool use_dynamic_types = true) :
00088 _filename(filename), _cur_model(0), _dictionary(ts),
00089 _dynamic_types(use_dynamic_types)
00090 {
00091 if (!open(filename,which_model)) {
00092 BTK_THROW_MSG(BTKFileReadFailure,
00093 "Couldn't find structure in PDB file");
00094 }
00095 }
00096
00100 PDBFileParser(PDBFileParser const & src) :
00101 _filename(src._filename), _header(src._header), _cur_model(src._cur_model),
00102 _dictionary(src._dictionary), _dynamic_types(src._dynamic_types)
00103 {
00104 open_file();
00105
00106 if (_cur_model > 0 && !find_model(_cur_model)) {
00107 BTK_THROW_MSG(BTKFileReadFailure,
00108 "Couldn't find requested model in PDB file");
00109 }
00110 }
00111
00112 virtual ~PDBFileParser() { close(); }
00113
00115 bool open(std::string filename,
00116 int which_model = 0);
00117
00119 void close()
00120 {
00121 _pdb_fs.close();
00122 _pdb_fs.clear();
00123 _cur_model = 0;
00124 _cur_line = "";
00125 }
00126
00128 std::string const & get_header() const { return _header; }
00129
00131 std::string const & get_filename() const { return _filename; }
00132
00134 template <typename PolymerSystemType>
00135 bool get_model(int which,
00136 PolymerSystemType & model_out,
00137 bool load_hetatoms = true);
00138
00140 template <typename PolymerSystemType>
00141 bool get_current_model(PolymerSystemType & model_out,
00142 bool load_hetatoms = true)
00143 {
00144 return get_model(_cur_model,model_out,load_hetatoms);
00145 }
00146
00148 template <typename PolymerSystemType>
00149 bool get_next_model(PolymerSystemType & model_out,
00150 bool load_hetatoms = true)
00151 {
00152 return get_model(_cur_model + 1,model_out,load_hetatoms);
00153 }
00154
00156 type_system dictionary() {return _dictionary;}
00157 type_system const dictionary() const { return _dictionary; }
00158
00159 protected:
00160
00161 std::string eat_whitespace(std::string const & s) const;
00162
00163 void open_file();
00164 void rewind_file();
00165 void save_header();
00166 bool find_model(int which);
00167 bool find_next_atom(bool find_hetatoms);
00168
00169 template <typename AtomType>
00170 AtomType parse_atom(std::string const & atom_record,
00171 bool is_hetatom = false);
00172
00173
00174 template <typename PolymerSystemType>
00175 void parse_current_model(PolymerSystemType & model_out,
00176 bool load_hetatoms);
00177
00178 template <typename AtomIterator, typename PolymerSystemType>
00179 void save_hetatoms(AtomIterator het_begin, AtomIterator het_end,
00180 PolymerSystemType & model_out);
00181
00182 private:
00183 std::string _filename;
00184 std::ifstream _pdb_fs;
00185 std::string _header;
00186 int _cur_model;
00187 std::string _cur_line;
00188
00189 type_system _dictionary;
00190 const bool _dynamic_types;
00191 };
00192
00193 }
00194 }
00195
00197
00199
00200 template <typename TypeSystem>
00201 template <typename PolymerSystemType>
00202 bool
00203 BTK::IO::PDBFileParser<TypeSystem>::
00204 get_model(int which,
00205 PolymerSystemType & model_out,
00206 bool load_hetatoms)
00207 {
00208 if (find_model(which)) {
00209 parse_current_model(model_out,load_hetatoms);
00210 return true;
00211 } else
00212 return false;
00213 }
00214
00215 template <typename TypeSystem>
00216 bool
00217 BTK::IO::PDBFileParser<TypeSystem>::
00218 open(std::string filename,
00219 int which_model)
00220 {
00221 close();
00222
00223 open_file();
00224 save_header();
00225
00226 if (!_pdb_fs) {
00227 BTK_THROW_MSG(BTKFileReadFailure,"Couldn't read past PDB header");
00228 }
00229
00230
00231 if (_cur_line.substr(0,5) == "MODEL") {
00232 _cur_model = std::atoi(_cur_line.substr(10).c_str());
00233 }
00234
00235
00236
00237 if (which_model > 0 && _cur_model != which_model) {
00238 if (!find_model(which_model)) return false;
00239 }
00240
00241 return true;
00242 }
00243
00244 template <typename TypeSystem>
00245 std::string
00246 BTK::IO::PDBFileParser<TypeSystem>::
00247 eat_whitespace(std::string const & s) const
00248 {
00249 std::string tmp(s);
00250
00251 tmp.erase(std::remove_if(tmp.begin(),tmp.end(),
00252 static_cast<int (*)(int)>(std::isspace)),
00253 tmp.end());
00254
00255 return tmp;
00256 }
00257
00258 template <typename TypeSystem>
00259 void
00260 BTK::IO::PDBFileParser<TypeSystem>::
00261 open_file()
00262 {
00263 if (!_filename.empty()) {
00264 _pdb_fs.open(_filename.c_str());
00265
00266 if (_pdb_fs.fail()) {
00267 BTK_THROW(BTKFileOpenFailure);
00268 }
00269 } else {
00270 BTK_THROW_MSG(BTKInvalidArgument,
00271 "Empty filename passed to PDBFileParser");
00272 }
00273
00274 _cur_model = 0;
00275 _cur_line = "";
00276 }
00277
00278 template <typename TypeSystem>
00279 void
00280 BTK::IO::PDBFileParser<TypeSystem>::
00281 rewind_file()
00282 {
00283 _pdb_fs.clear();
00284 _pdb_fs.seekg(0,std::ios::beg);
00285 if (!_pdb_fs) {
00286 BTK_THROW_MSG(BTKFileReadFailure,
00287 "Rewind seek failed in rewind_file()");
00288 }
00289 _cur_model = 0;
00290 _cur_line = "";
00291 }
00292
00293 template <typename TypeSystem>
00294 void
00295 BTK::IO::PDBFileParser<TypeSystem>::
00296 save_header()
00297 {
00298
00299
00300
00301
00302
00303
00304 rewind_file();
00305
00306 if (!_pdb_fs) {
00307 BTK_THROW_MSG(BTKFileReadFailure,
00308 "Bad stream in parse_header()");
00309 }
00310
00311 _header.clear();
00312
00313 while (getline(_pdb_fs,_cur_line)) {
00314 if (_cur_line.substr(0,4) == "ATOM" ||
00315 _cur_line.substr(0,6) == "HETATM" ||
00316 _cur_line.substr(0,5) == "MODEL") return;
00317
00318 _header += _cur_line;
00319 _header += '\n';
00320 }
00321
00322 if (!_pdb_fs) {
00323 BTK_THROW_MSG(BTKFileReadFailure,
00324 "Reached EOF in parse_header()");
00325 }
00326 }
00327
00328 template <typename TypeSystem>
00329 bool
00330 BTK::IO::PDBFileParser<TypeSystem>::
00331 find_model(int which)
00332 {
00333
00334
00335
00336 if (_pdb_fs.good()) {
00337
00338 if (which == 0) {
00339 rewind_file();
00340 return find_next_atom(false);
00341 } else if (which <= _cur_model) {
00342 rewind_file();
00343 }
00344 } else if (_pdb_fs.eof()) {
00345
00346 rewind_file();
00347 } else {
00348
00349 BTK_THROW_MSG(BTKFileReadFailure,
00350 "Bad stream in find_model()");
00351 }
00352
00353
00354
00355 while (_pdb_fs.good()) {
00356
00357 do {
00358 if (_cur_line.substr(0,5) == "MODEL") break;
00359 } while (getline(_pdb_fs,_cur_line));
00360
00361 if (_pdb_fs.eof()) {
00362
00363 return false;
00364 } else if (_pdb_fs.fail()) {
00365
00366 BTK_THROW(BTKFileReadFailure);
00367 } else {
00368
00369 int num = std::atoi(_cur_line.substr(10).c_str());
00370
00371 if (which == 0 || num == which) {
00372 _cur_model = num;
00373 return true;
00374 }
00375 }
00376 }
00377
00378
00379 return false;
00380 }
00381
00382 template <typename TypeSystem>
00383 bool
00384 BTK::IO::PDBFileParser<TypeSystem>::
00385 find_next_atom(bool find_hetatoms)
00386 {
00387 while (_cur_line.substr(0,4) != "ATOM" &&
00388 (!find_hetatoms || _cur_line.substr(0,6) != "HETATM"))
00389 {
00390 getline(_pdb_fs,_cur_line);
00391 if (!_pdb_fs) return false;
00392 }
00393
00394 return true;
00395 }
00396
00397 template <typename TypeSystem>
00398 template <typename AtomType>
00399 AtomType
00400 BTK::IO::PDBFileParser<TypeSystem>::
00401 parse_atom(std::string const & atom_record,
00402 bool is_hetatom)
00403 {
00404 using std::atoi;
00405 using std::atof;
00406 using std::string;
00407
00408 int atom_num = atoi(atom_record.substr(6,5).c_str());
00409 int res_num = atoi(atom_record.substr(22,4).c_str());
00410 string atom_name = eat_whitespace(atom_record.substr(12,4));
00411 string res_name = eat_whitespace(atom_record.substr(17,3));
00412 char alt_loc = atom_record[16];
00413 char chain_id = atom_record[21];
00414 char insert_code = atom_record[26];
00415 BTK::MATH::BTKVector position(atof(atom_record.substr(30,8).c_str()),
00416 atof(atom_record.substr(38,8).c_str()),
00417 atof(atom_record.substr(46,8).c_str()));
00418
00419 monomer_id_type res_type =
00420 _dictionary.get_monomer_dictionary().get_type(res_name);
00421
00422 if (res_type == BTK::UTILITY::TypeIDTraits<monomer_id_type>::unknown()) {
00423 std::ostringstream msg;
00424 msg << "Warning: unknown residue name " << res_name << " in PDB file.";
00425 BTK::IO::LOGGING::log_msg(msg.str(),
00426 BTK::IO::LOGGING::IO & BTK::IO::LOGGING::WARNING);
00427 }
00428
00429 atom_id_type atom_type =
00430 _dictionary.get_atom_dictionary().get_type(atom_name);
00431
00432 if (atom_type == BTK::UTILITY::TypeIDTraits<atom_id_type>::unknown()) {
00433 std::ostringstream msg;
00434 msg << "Warning: unknown atom name " << atom_name << " in PDB file.";
00435 BTK::IO::LOGGING::log_msg(msg.str(),
00436 BTK::IO::LOGGING::IO & BTK::IO::LOGGING::WARNING);
00437 }
00438
00439
00440
00441
00442 double occupancy(0), b_factor(0);
00443
00444 if (atom_record.size() >= 60) {
00445 occupancy = atof(atom_record.substr(54,6).c_str());
00446 }
00447
00448 if (atom_record.size() >= 66) {
00449 b_factor = atof(atom_record.substr(60,6).c_str());
00450 }
00451
00452 string segment_id;
00453
00454 if (atom_record.size() >= 76) {
00455 segment_id = eat_whitespace(atom_record.substr(72,4));
00456 }
00457
00458 string element_name;
00459 element_id_type element_type;
00460
00461 if (atom_record.size() >= 78 &&
00462 (std::isalpha(atom_record[76]) || std::isalpha(atom_record[77]))) {
00463
00464 element_name = eat_whitespace(atom_record.substr(76,2));
00465 element_type = _dictionary.get_element_dictionary().get_type(element_name);
00466 } else {
00467
00468
00469
00470
00471
00472 if (atom_name == "ZN" || atom_name == "MG" || atom_name == "SE") {
00473 element_name = atom_name;
00474 element_type = _dictionary.get_element_dictionary().get_type(atom_name);
00475 } else if (is_hetatom && atom_name == "CA") {
00476
00477
00478
00479 element_name = atom_name;
00480 element_type = _dictionary.get_element_dictionary().get_type(atom_name);
00481 } else {
00482
00483 unsigned i = atom_name.find_first_of("NCHOSP");
00484 element_name = atom_name.substr(i,1);
00485
00486 if (i != std::string::npos) {
00487 element_type = _dictionary.get_element_dictionary().get_type(element_name);
00488 }
00489 }
00490
00491 if (element_type == BTK::UTILITY::TypeIDTraits<element_id_type>::unknown()) {
00492 std::ostringstream msg;
00493 msg << "Warning: uknown element name " << element_name << " found in PDB file.";
00494 BTK::IO::LOGGING::log_msg(msg.str(),
00495 BTK::IO::LOGGING::IO &
00496 BTK::IO::LOGGING::WARNING);
00497 }
00498 }
00499
00500 string charge;
00501
00502 if (atom_record.size() >= 80) {
00503 charge = eat_whitespace(atom_record.substr(78,2));
00504 }
00505
00506 return AtomType(dictionary(),
00507 is_hetatom,
00508 atom_num,
00509 atom_type,
00510 alt_loc,
00511 res_type,
00512 chain_id,
00513 res_num,
00514 insert_code,
00515 position,
00516 occupancy,
00517 b_factor,
00518 segment_id,
00519 element_type,
00520 charge);
00521 }
00522
00523
00524 template <typename TypeSystem>
00525 template <typename PolymerSystemType>
00526 void
00527 BTK::IO::PDBFileParser<TypeSystem>::
00528 parse_current_model(PolymerSystemType & model_out,
00529 bool load_hetatoms)
00530 {
00531 typedef typename PolymerSystemType::atom_type AtomType;
00532 typedef typename PolymerSystemType::monomer_type MonomerType;
00533 typedef typename PolymerSystemType::chain_type PolymerType;
00534 typedef typename PolymerSystemType::chain_iterator ChainIterator;
00535
00536 std::vector<AtomType> tmp_hetatoms, tmp_atoms;
00537 bool must_find_endmdl = false;
00538 bool found_endmdl = false;
00539
00540
00541
00542 {
00543 BTK::IO::LOGGING::Logger::logger_ptr
00544 tmp(new BTK::IO::LOGGING::
00545 CachedLogger(BTK::IO::LOGGING::
00546 Logger::instance()));
00547
00548 BTK::IO::LOGGING::Logger::push_logger(tmp);
00549 }
00550
00551
00552 model_out.clear();
00553
00554
00555
00556 if (_cur_line.substr(0,5) == "MODEL") {
00557 _cur_model = std::atoi(_cur_line.substr(10).c_str());
00558 must_find_endmdl = true;
00559 }
00560
00561
00562 if (!find_next_atom(load_hetatoms)) {
00563 BTK_THROW_MSG(BTKFileReadFailure,
00564 "Couldn't find first atom in PDB model");
00565 }
00566
00567
00568 bool found_ter = false;
00569 char prev_chain_id = _cur_line[21];
00570 std::string prev_res_name = eat_whitespace(_cur_line.substr(17,3));
00571 unsigned prev_res_num = std::atoi(_cur_line.substr(22,4).c_str());
00572
00573
00574 ChainIterator cur_chain =
00575 model_out.insert(model_out.system_begin(),PolymerType());
00576 cur_chain->set_chain_id(_cur_line[21]);
00577
00578
00579
00580 do {
00581 if (_cur_line.substr(0,4) == "ATOM") {
00582 bool add_new_residue = false;
00583 bool add_new_chain = false;
00584
00585
00586 std::string res_name = eat_whitespace(_cur_line.substr(17,3));
00587 unsigned res_num = std::atoi(_cur_line.substr(22,4).c_str());
00588
00589 if (_cur_line[21] == prev_chain_id) {
00590
00591 if (res_num != prev_res_num)
00592 add_new_residue = true;
00593 else if (res_name != prev_res_name) {
00594
00595 std::ostringstream msg;
00596 msg << "PDB res name change w/o res num change in "
00597 << _cur_line.substr(0,11) << " (cur name="
00598 << res_name << ", prev name=" << prev_res_name << ")";
00599 BTK_THROW_MSG(BTKCorruptPDBFile,msg.str().c_str());
00600 }
00601
00602 } else {
00603
00604 if (!found_ter) {
00605 std::ostringstream msg;
00606 msg << "Chain id changed from "
00607 << prev_chain_id << " to " << _cur_line[21]
00608 << " w/o a TER record.\n";
00609 using namespace BTK::IO::LOGGING;
00610 Logger::instance().log(msg.str(),WARNING & IO);
00611 }
00612
00613 add_new_residue = true;
00614 add_new_chain = true;
00615 }
00616
00617
00618 if (add_new_residue && tmp_atoms.size()) {
00619 cur_chain->insert(cur_chain->polymer_end(),
00620 MonomerType(tmp_atoms.begin(),
00621 tmp_atoms.end(),
00622 tmp_atoms.begin()->res_type(),
00623 tmp_atoms.begin()->res_number()));
00624
00625 tmp_atoms.clear();
00626 prev_res_name = res_name;
00627 prev_res_num = res_num;
00628 }
00629
00630
00631 if (add_new_chain) {
00632 cur_chain = model_out.insert(model_out.system_end(),PolymerType());
00633 cur_chain->set_chain_id(_cur_line[21]);
00634 prev_chain_id = _cur_line[21];
00635 found_ter = false;
00636 }
00637
00638
00639 tmp_atoms.push_back(parse_atom<AtomType>(_cur_line));
00640
00641 } else if (_cur_line.substr(0,6) == "HETATM") {
00642
00643 if (!load_hetatoms) continue;
00644
00645 tmp_hetatoms.push_back(parse_atom<AtomType>(_cur_line,true));
00646
00647 } else if (_cur_line.substr(0,3) == "TER") {
00648
00649 found_ter = true;
00650
00651 } else if (_cur_line.substr(0,6) == "ENDMDL") {
00652
00653 found_endmdl = true;
00654 break;
00655 } else if (_cur_line.substr(0,3) == "END") {
00656
00657 } else if (_cur_line.substr(0,6) == "CONECT") {
00658
00659 } else if (_cur_line.size() > 0) {
00660 std::ostringstream msg;
00661 msg << "Unknown record type in pdb file : "
00662 << _cur_line.substr(0,6) << '\n';
00663
00664 using namespace BTK::IO::LOGGING;
00665 Logger::instance().log(msg.str(),WARNING & IO);
00666 }
00667 } while (getline(_pdb_fs,_cur_line));
00668
00669
00670 if (tmp_atoms.size()) {
00671 cur_chain->insert(cur_chain->polymer_end(),
00672 MonomerType(tmp_atoms.begin(),
00673 tmp_atoms.end(),
00674 tmp_atoms.begin()->res_type(),
00675 tmp_atoms.begin()->res_number()));
00676 tmp_atoms.clear();
00677 }
00678
00679 if (must_find_endmdl && !found_endmdl) {
00680 using namespace BTK::IO::LOGGING;
00681 Logger::instance().log("Reached end of PDB before finding ENDMDL record",
00682 WARNING & IO);
00683 }
00684
00685 if (load_hetatoms) save_hetatoms(tmp_hetatoms.begin(),
00686 tmp_hetatoms.end(),
00687 model_out);
00688
00689
00690 BTK::IO::LOGGING::Logger::pop_logger();
00691 }
00692
00693 struct hetatom_compare {
00694 template <typename AtomType>
00695 bool operator()(AtomType const & a1,
00696 AtomType const & a2) const
00697 {
00698 if (a1.chain_id() < a2.chain_id()) return true;
00699 else if (a1.chain_id() == a2.chain_id()) {
00700 if (a1.res_number() < a2.res_number()) return true;
00701 else return false;
00702 } else return false;
00703 }
00704 };
00705
00706 template <typename TypeSystem>
00707 template <typename AtomIterator, typename PolymerSystemType>
00708 void
00709 BTK::IO::PDBFileParser<TypeSystem>::
00710 save_hetatoms(AtomIterator het_begin, AtomIterator het_end,
00711 PolymerSystemType & model_out)
00712 {
00713 typedef typename PolymerSystemType::chain_type PolymerType;
00714 typedef typename PolymerSystemType::monomer_type MonomerType;
00715 typedef typename PolymerSystemType::chain_iterator ChainIterator;
00716
00717 if (het_begin == het_end) return;
00718
00719
00720 std::stable_sort(het_begin,
00721 het_end,
00722 hetatom_compare());
00723
00724 AtomIterator i, j, end = het_end;
00725 int prev_res_num;
00726 char prev_chain_id;
00727
00728
00729 ChainIterator cur_chain = model_out.insert(model_out.system_end(),
00730 PolymerType());
00731 cur_chain->set_chain_id(het_begin->chain_id());
00732
00733 prev_chain_id = het_begin->chain_id();
00734 prev_res_num = het_begin->res_number();
00735
00736
00737 for (i = het_begin, j = het_begin; j != end; ++j) {
00738
00739
00740 if (j->res_number() != prev_res_num) {
00741 cur_chain->insert(cur_chain->polymer_end(),
00742 MonomerType(i,j,i->res_type(),
00743 i->res_number()));
00744
00745 prev_res_num = j->res_number();
00746 i = j;
00747 }
00748
00749
00750 if (j->chain_id() != prev_chain_id) {
00751 cur_chain = model_out.insert(model_out.system_end(),PolymerType());
00752 cur_chain->set_chain_id(j->chain_id());
00753
00754 prev_chain_id = j->chain_id();
00755 i = j;
00756 }
00757 }
00758
00759
00760 if (i != j) cur_chain->insert(cur_chain->polymer_end(),
00761 MonomerType(i,j,i->res_type(),
00762 i->res_number()));
00763 }
00764
00765 #endif // BTK_PDB_FILE_PARSER_HPP