// // the original source is outline.c by Clark Cooper. // #include #include #include #include #include #include using namespace std; #define BUFFSIZE 8192 char Buff[BUFFSIZE]; struct Tag { string name; string str; map attributes; Tag(const string &n) : name(n) {} }; struct Parseinfo { enum TagType { OPEN, CLOSE }; int skip; int depth; string cdata; vector st; TagType last_tagtype; Parseinfo() { skip = 0; depth = 1; last_tagtype = CLOSE; } }; vector sentence_segmentation(const string & abs) { vector vs; int begin = 0; for (int i = 0; ; i++) { if (i == abs.size() || (i > 1 && abs[i-2] == '.' && abs[i-1] == ' ' && (isupper(abs[i]) || abs[i] == ' ') ) ) { vs.push_back(abs.substr(begin, i - begin)); begin = i; } if (i == abs.size()) break; } return vs; } static void print_cdata(const string& s) { for (int i = 0; i < s.size(); i++) { if (s[i] == '<') { cout << "<"; continue; } if (s[i] == '>') { cout << ">"; continue; } if (s[i] == '&') { cout << "&"; continue; } cout << s[i]; } } static void process_cdata(Parseinfo* inf, bool tag_start) { string s(inf->cdata); vector& st = inf->st; for (int i = 1; i < st.size(); i++) { Tag &t = st[i]; t.str += s; } return; } static void start(void *data, const char *el, const char **attr) { Parseinfo *inf = (Parseinfo *) data; Tag t(el); for (int i = 0; attr[i]; i += 2) { t.attributes[attr[i]] = attr[i+1]; } process_cdata(inf, true); inf->st.push_back(t); inf->depth++; inf->cdata = ""; inf->last_tagtype = Parseinfo::OPEN; } static void end(void *data, const char *el) { Parseinfo *inf = (Parseinfo *) data; process_cdata(inf, false); Tag& t = inf->st.back(); static string pmid; if (t.name == "PMID") { pmid = t.str; } if (t.name == "AbstractText") { vector vs(sentence_segmentation(t.str)); for (vector::const_iterator i = vs.begin(); i != vs.end(); i++) { cout << pmid << "\tA\t" << *i << endl; } } if (t.name == "ArticleTitle") { vector vs(sentence_segmentation(t.str)); for (vector::const_iterator i = vs.begin(); i != vs.end(); i++) { cout << pmid << "\tT\t" << *i << endl; } } /* if (t.attributes.find("sem") != t.attributes.end()) { cout << t.str << "\t" << t.attributes["sem"] << endl; if (t.attributes.find("lex") != t.attributes.end()) { if ((t.attributes["lex"])[0] == '*') cerr << "warning: \"" << t.str << " " << t.attributes["sem"] << "\" " << "looks like a fragment while it has a sem-value." << endl; } } */ inf->st.pop_back(); inf->depth--; inf->cdata = ""; inf->last_tagtype = Parseinfo::CLOSE; } static void chardata(void *data, const char *str, int len) { Parseinfo *inf = (Parseinfo *) data; string s(str, len); // Tag& t = inf->st.top(); inf->cdata += s; // cout << t.attributes["sem"] << " " << len << " " << s << endl; } int main(int argc, char *argv[]) { // cout << ""; XML_Parser p = XML_ParserCreate(NULL); if (! p) { fprintf(stderr, "Couldn't allocate memory for parser\n"); exit(-1); } Parseinfo inf; XML_SetUserData(p, &inf); XML_SetElementHandler(p, start, end); XML_SetCharacterDataHandler(p, chardata); for (;;) { int done; int len; len = fread(Buff, 1, BUFFSIZE, stdin); if (ferror(stdin)) { fprintf(stderr, "Read error\n"); exit(-1); } done = feof(stdin); if (XML_Parse(p, Buff, len, done) == 0) { fprintf(stderr, "Parse error at line %d:\n%s\n", XML_GetCurrentLineNumber(p), XML_ErrorString(XML_GetErrorCode(p))); exit(-1); } if (done) break; } return 0; }