/*
 * USAGE
 *   java CombineMKWithACE.java PATH_TO_ACE PATH_TO_MK_DATA PATH_TO_OUTPUT_DIRECTORY
 *
 * The three arguments are as follows:
 * - PATH_TO_ACE - This is the path to top level directory of the ACE 2005 distribution.
 *   For example "/Users/paul/ace_2005_td_v7"
 *   NOTE: The structure of this directory is expected to be exactly in the format provided by the LDC.
 *   Specifically, it is expected that there will be a "dtd" directory, containing the file "apf.v5.1.1.dtd",
 *   and a data directory, containing an English subdirectory. Within "English", there should be the subdirectories
 *   bn, bc, nw, cts, wl and un, and within each of these, there should be a "timex2norm" subdirectory.
 *
 * - PATH_TO_MK_DATA - This is the path to the downloaded and unpacked ACE-MK directory, containing the
 *   meta-knowledge XML files, for example "/Users/paul/ACE-MK". It is expected that the directory structure
 *   within ACE-MK will not have been altered from the download provided.
 *
 * - PATH_TO_OUTPUT_DIRECTORY - This is the path to the directory where it is desired that the integrated XML files
 *   resulting from the application of the program will be stored. For example, "/Users/paul/COMBINED-ACE-MK".
 *   NOTE: The output directory may be an existing directory, otherwise, the program will create it.
 *
 * As a result of running the program, sub-directories corresponding to each part of the corpus
 * (i.e., bn, bc, nw, cts, wl and un) will be created (if they do not exist already). Within each of these
 * sub-directories, an XML file with the extension ".apf.mk.xml" will be created for each of the original
 * XML annotation files, which combines the original annotation information with
 * the meta-knowledge annotation information.
 *
 * If you use this code, please do the following
 * - Attribute the National Centre for Text Mining, School of Computer Science, University of Manchester, UK
 * - Cite the following paper: Thompson, P., Nawaz, R., McNaught, J. and Ananiadou, S. (2016).
 *   Enriching News Events with Meta-knowledge Information.
 *   Language Resources and Evaluation.
 *   DOI: 10.1007/s10579-016-9344-9
 */

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.*;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Merges ACE annotation files ({@code *.apf.xml}) with their meta-knowledge
 * counterparts and writes one combined {@code *.apf.mk.xml} file per input
 * file into the output directory.
 */
public class CombineMKWithACE {

    /** Sub-directories that the constructor creates below the output directory. */
    private static final String[] CORPUS_SECTIONS = { "bn", "bc", "cts", "nw", "un", "wl" };

    /** Meta-knowledge attributes copied from an MK event mention onto the ACE one. */
    private static final String[] MK_ATTRIBUTES = {
        "MK-GENERICITY", "MK-MODALITY", "MK-SOURCE-TYPE",
        "MK-SUBJECTIVITY", "MK-POLARITY", "MK-TENSE"
    };

    /** DTD file name referenced by the ACE annotation files. */
    private static final String ACE_DTD_NAME = "apf.v5.1.1.dtd";

    /** DTD file name referenced by the meta-knowledge files. */
    private static final String MK_DTD_NAME = "add.dtd";

    /** System identifier written into the DOCTYPE of every combined file. */
    private static final String OUTPUT_DTD_NAME = "apf.v6.1.1.dtd";

    /** Extension stripped from an ACE file name before a new suffix is appended. */
    private static final String XML_EXTENSION = ".xml";

    DocumentBuilder builder;

    int totalEvents, totalEventMentions, totalFiles, totalMultiMentionEvents;

    int polarityProbEvents, modalityProbEvents, genericityProbEvents,
            subjectivityProbEvents, sourceTypeProbEvents, tenseProbEvents;

    public String PATH_TO_ACE;
    public String PATH_TO_MK_DATA;
    public String PATH_TO_OUTPUT_DIR;

    /**
     * Creates a combiner, prepares a validating XML parser whose DTD lookups
     * are redirected to the two distributions, and creates the output
     * directory together with one sub-directory per corpus section.
     *
     * @param acePath    top-level directory of the ACE distribution
     * @param mkPath     top-level directory of the meta-knowledge distribution
     * @param outputPath directory that receives the combined files
     * @throws ParserConfigurationException if no XML parser can be configured
     */
    public CombineMKWithACE(String acePath, String mkPath, String outputPath)
            throws ParserConfigurationException {
        PATH_TO_ACE = acePath;
        PATH_TO_MK_DATA = mkPath;
        PATH_TO_OUTPUT_DIR = outputPath;

        final File aceDtd = dtdFile(PATH_TO_ACE, ACE_DTD_NAME);
        final File mkDtd = dtdFile(PATH_TO_MK_DATA, MK_DTD_NAME);

        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(true);
        builder = factory.newDocumentBuilder();
        builder.setEntityResolver(new EntityResolver() {
            @Override
            public InputSource resolveEntity(String publicId, String systemId)
                    throws SAXException, IOException {
                // A system identifier is a URI, so hand the parser a proper
                // file: URI rather than a raw platform-specific path.
                if (systemId != null && systemId.endsWith(MK_DTD_NAME)) {
                    return new InputSource(mkDtd.toURI().toString());
                } else if (systemId != null && systemId.endsWith(ACE_DTD_NAME)) {
                    return new InputSource(aceDtd.toURI().toString());
                }
                throw new SAXException("unable to resolve entity; "
                        + "public = \"" + publicId + "\", "
                        + "system = \"" + systemId + "\"");
            }
        });

        ensureDirectory(new File(PATH_TO_OUTPUT_DIR));
        for (String section : CORPUS_SECTIONS) {
            ensureDirectory(new File(PATH_TO_OUTPUT_DIR + File.separator + section));
        }
    }

    /**
     * Walks {@code PATH_TO_ACE/data/English}. For every sub-directory it looks
     * for a child named {@code timex2norm} and combines each {@code *.apf.xml}
     * file found there with the meta-knowledge file of the corresponding name
     * under {@code PATH_TO_MK_DATA/<sub-directory name>}.
     *
     * <p>Problems are reported on the console; this method does not throw.
     */
    public void combineDirectories() {
        try {
            File englishDir = new File(PATH_TO_ACE + File.separator + "data"
                    + File.separator + "English");
            // listFiles() returns null when the path is missing or unreadable.
            File[] sectionDirs = englishDir.listFiles();
            if (sectionDirs == null) {
                System.err.println("Cannot list ACE data directory: " + englishDir);
                return;
            }
            for (File sectionDir : sectionDirs) {
                if (!sectionDir.isDirectory()) {
                    continue;
                }
                String fileType = sectionDir.getName();
                System.out.println(fileType);
                File[] children = sectionDir.listFiles();
                if (children == null) {
                    System.err.println("Cannot list directory: " + sectionDir);
                    continue;
                }
                for (File child : children) {
                    if (child.getName().equals("timex2norm")) {
                        combineAnnotationDirectory(child, fileType);
                    }
                }
            }
        } catch (RuntimeException e) {
            // Keep the original contract: traversal failures are reported, not thrown.
            e.printStackTrace();
        }
    }

    /** Combines every {@code *.apf.xml} file of one annotation directory. */
    private void combineAnnotationDirectory(File annotationDir, String fileType) {
        File[] annotationFiles = annotationDir.listFiles();
        if (annotationFiles == null) {
            System.err.println("Cannot list directory: " + annotationDir);
            return;
        }
        for (File aceFile : annotationFiles) {
            String aceFileName = aceFile.getName();
            if (!aceFileName.endsWith(".apf.xml")) {
                continue;
            }
            System.out.println(aceFile);
            String mkFileName = stripXmlExtension(aceFileName) + ".add.xml";
            File mkFile = new File(PATH_TO_MK_DATA + File.separator + fileType
                    + File.separator + mkFileName);
            combineFiles(aceFile, mkFile, fileType);
        }
    }

    /**
     * Merges one ACE file with its meta-knowledge file and writes the result
     * to {@code PATH_TO_OUTPUT_DIR/<fileType>/<ACE name without .xml>.mk.xml}.
     *
     * <p>All {@code mk-cue} and {@code mk-source} elements of the MK document
     * are appended to the first {@code document} element of the ACE document.
     * Each ACE {@code event_mention} that has an MK {@code event_mention} with
     * the same {@code ID} receives the six {@code MK-*} attributes and copies
     * of the {@code event_mention_mk_evidence} elements. Mentions without a
     * match are left untouched and reported.
     *
     * @param ACEFile  ACE annotation file
     * @param MKFile   meta-knowledge file for the same document
     * @param fileType name of the corpus section, used as output sub-directory
     */
    private void combineFiles(File ACEFile, File MKFile, String fileType) {
        if (!MKFile.isFile()) {
            System.err.println("Meta-knowledge file not found, skipping: " + MKFile);
            return;
        }
        try {
            File outputFile = new File(PATH_TO_OUTPUT_DIR + File.separator + fileType
                    + File.separator + stripXmlExtension(ACEFile.getName()) + ".mk.xml");

            Document aceDoc = builder.parse(ACEFile);
            Document mkDoc = builder.parse(MKFile);

            Element docEl = (Element) aceDoc.getElementsByTagName("document").item(0);
            if (docEl == null) {
                System.err.println("No document element in " + ACEFile + ", skipping");
                return;
            }

            appendImportedCopies(mkDoc.getElementsByTagName("mk-cue"), aceDoc, docEl);
            appendImportedCopies(mkDoc.getElementsByTagName("mk-source"), aceDoc, docEl);

            Map<String, Element> mkMentionsById =
                    indexById(mkDoc.getElementsByTagName("event_mention"));
            NodeList aceMentions = aceDoc.getElementsByTagName("event_mention");
            for (int i = 0; i < aceMentions.getLength(); i++) {
                Element aceMention = (Element) aceMentions.item(i);
                String mentionId = aceMention.getAttribute("ID");
                Element mkMention = mkMentionsById.get(mentionId);
                if (mkMention == null) {
                    // Only a genuine ID match may contribute attributes; never
                    // fall back to whichever MK mention was inspected last.
                    System.out.println("MKEvMention node not found for event mention "
                            + mentionId);
                    continue;
                }
                for (String attribute : MK_ATTRIBUTES) {
                    aceMention.setAttribute(attribute, mkMention.getAttribute(attribute));
                }
                appendImportedCopies(
                        mkMention.getElementsByTagName("event_mention_mk_evidence"),
                        aceDoc, aceMention);
            }

            File outputParent = outputFile.getParentFile();
            if (outputParent != null) {
                ensureDirectory(outputParent);
            }

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(
                    "{http://xml.apache.org/xslt}indent-amount", "2");
            transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, OUTPUT_DTD_NAME);
            transformer.transform(new DOMSource(aceDoc), new StreamResult(outputFile));
        } catch (Exception e) {
            // Best effort per file: one bad document must not abort the whole run.
            System.err.println("Failed to combine " + ACEFile + " with " + MKFile);
            e.printStackTrace();
        }
    }

    /** Returns the location of {@code <root>/dtd/<dtdName>} as an absolute file. */
    private static File dtdFile(String root, String dtdName) {
        return new File(new File(new File(root).getAbsolutePath(), "dtd"), dtdName);
    }

    /** Creates {@code dir} (and missing parents) if absent; reports a failure. */
    private static void ensureDirectory(File dir) {
        if (!dir.exists() && !dir.mkdirs()) {
            System.err.println("Could not create directory: " + dir);
        }
    }

    /** Removes the trailing {@code .xml} from a file name known to end with it. */
    private static String stripXmlExtension(String fileName) {
        return fileName.substring(0, fileName.length() - XML_EXTENSION.length());
    }

    /** Deep-copies every node of {@code nodes} into {@code targetDoc} below {@code parent}. */
    private static void appendImportedCopies(NodeList nodes, Document targetDoc,
            Element parent) {
        for (int i = 0; i < nodes.getLength(); i++) {
            parent.appendChild(targetDoc.importNode(nodes.item(i), true));
        }
    }

    /** Maps each {@code ID} attribute value to the first element carrying it. */
    private static Map<String, Element> indexById(NodeList elements) {
        Map<String, Element> byId = new HashMap<String, Element>();
        for (int i = 0; i < elements.getLength(); i++) {
            Element element = (Element) elements.item(i);
            String id = element.getAttribute("ID");
            if (!byId.containsKey(id)) {
                byId.put(id, element);
            }
        }
        return byId;
    }

    /**
     * Command-line entry point; expects exactly three arguments (ACE path,
     * meta-knowledge path, output path) and prints a usage line otherwise.
     *
     * @param args command-line arguments
     * @throws ParserConfigurationException if no XML parser can be configured
     */
    public static void main(String[] args) throws ParserConfigurationException {
        if (args.length == 3) {
            CombineMKWithACE comb = new CombineMKWithACE(args[0], args[1], args[2]);
            comb.combineDirectories();
        } else {
            System.out.println("java CombineMKWithACE.java "
                    + "PATH_TO_ACE PATH_TO_MK_DATA PATH_TO_OUTPUT_DIRECTORY");
        }
    }
}