/* PUBLIC DOMAIN NOTICE NIH Chemical Genomics Center National Human Genome Research Institute This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software/database is freely available to the public for use. The NIH Chemical Genomics Center (NCGC) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCGC and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCGC and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material. */ // $Id$ // compute atom pair (AP) descriptor type package gov.nih.ncgc.descriptor; import java.util.Map; import java.util.HashMap; import java.util.Vector; import java.util.Set; import java.util.TreeSet; import java.text.MessageFormat; import chemaxon.struc.*; import chemaxon.formats.*; import chemaxon.util.MolHandler; public class AtomPairDescriptor { private int maxPath; // max path length private static Set atomSet; private static int valtab[][] = new int[150][]; static { valtab[ 6] = new int[]{1, 2, 3, 4}; // C valtab[ 7] = new int[]{1, 2, 3, 4}; // N valtab[ 8] = new int[]{1, 2}; // O valtab[ 9] = new int[]{1}; // F valtab[15] = new int[]{1, 2, 3, 4}; // P valtab[16] = new int[]{1, 2, 3, 4}; // S valtab[17] = new int[]{1}; // Cl valtab[35] = new int[]{1}; // Br valtab[53] = new int[]{1}; // I atomSet = new TreeSet(); for (int i = 0; i < valtab.length; ++i) { if (valtab[i] != null && valtab[i].length > 0) { atomSet.add(i); } } } private Map sparseFeatures = new HashMap(); private static String nameOf (int feature) { int atype = (feature >> 16) & 0x00ff; int pi = (feature & 0x0000ffff) >> 8; int nb = (feature & 0x000000ff); String name = (atype < 0xff ? MolAtom.symbolOf(atype) : "X"); return name + pi + nb; } public AtomPairDescriptor (MoleculeGraph g) { MolAtom atoms[] = g.getAtomArray(); // compute atom property and pack it into an integer int features[] = new int[atoms.length]; for (int i = 0; i < atoms.length; ++i) { int atype = 0xff; if (atomSet.contains(atoms[i].getAtno())) { atype = atoms[i].getAtno(); } int pi = numPiElectrons (atoms[i]); int nb = atoms[i].getBondCount(); features[i] = (atype << 16) | (pi << 8) | nb; /* System.out.println((i + 1) + " " + MolAtom.symbolOf(atoms[i].getAtno()) + " " + nameOf (features[i]) + " " + features[i] + " = " + atype + " " + pi + " " + nb); */ } int [][]shortestPath = allShortestPaths (g); for (int i = 0; i < shortestPath.length; ++i) { for (int j = i+1; j < shortestPath[i].length; ++j) { String path = nameOf (Math.min(features[i], features[j])) + nameOf (Math.max(features[i], features[j])) + (shortestPath[i][j] < 10 ? "0" : "") + shortestPath[i][j]; Integer count = sparseFeatures.get(path); sparseFeatures.put(path, count != null ? count + 1 : 1); } } } public Map features () { return sparseFeatures; } public static int numPiElectrons (MolAtom atom) { if (atom.hasAromaticBond()) return 1; int valence = 0, order = atom.getBondCount(); for (int i = 0; i < order; ++i) { valence += atom.getBond(i).getType(); } return valence - order; } public static int[][] allShortestPaths (MoleculeGraph g) { int atomCount = g.getAtomCount(); int[][] shortestPaths = new int[atomCount][atomCount]; int[][] tab = g.createBHtab(); for (int i = 0; i < atomCount; ++i) { shortestPaths[i][i] = 0; for (int j = i+1; j < atomCount; ++j) { shortestPaths[j][i] = shortestPaths[i][j] = tab[i][j] < 0 ? atomCount : 1; } } tab = null; /* Floyd's all-pairs shortest path algorithm */ for (int k = 0; k < atomCount; ++k) for (int i = 0; i < atomCount; ++i) for (int j = 0; j < atomCount; ++j) shortestPaths[i][j] = Math.min (shortestPaths[i][j], shortestPaths[i][k] + shortestPaths[k][j]); /* for (int i = 0; i < atomCount; ++i) { for (int j = i+1; j < atomCount; ++j) { System.out.println ("d("+(i+1)+","+(j+1)+") = " + shortestPaths[i][j]); } } */ return shortestPaths; } public static double cityBlockDistance (MoleculeGraph g1, MoleculeGraph g2) { return cityBlockDistance (new AtomPairDescriptor (g1).features(), new AtomPairDescriptor (g2).features()); } public static double cityBlockDistance (Map f1, Map f2) { double d = 0.; for (Map.Entry e : f1.entrySet()) { Integer v2 = f2.get(e.getKey()); if (v2 != null) { // atom pair that's shared between two feature vectors d += Math.abs(e.getValue() - v2); } else { // this feature vector isn't in f2 d += e.getValue(); } } // now count atom pairs in f2 that aren't in f1 for (Map.Entry e : f2.entrySet()) { if (!f1.containsKey(e.getKey())) { d += e.getValue(); } } return d; } public static double similarity (Map f1, Map f2) { double s = 0.; int f1count = 0; for (Map.Entry e : f1.entrySet()) { Integer v2 = f2.get(e.getKey()); if (v2 != null) { s += Math.min(e.getValue(), v2); } f1count += e.getValue(); } int f2count = 0; for (Integer e : f2.values()) { f2count += e; } s *= 2./(f1count + f2count); return s; } public static double similarity (MoleculeGraph g1, MoleculeGraph g2) { return similarity (new AtomPairDescriptor (g1).features(), new AtomPairDescriptor (g2).features()); } public static void main (String argv[]) throws Exception { if (argv.length == 0) { System.out.println("AtomPairDescriptor FILES..."); System.exit(1); } Vector> features = new Vector>(); for (int i = 0; i < argv.length; ++i) { MolImporter importer = new MolImporter (argv[i]); for (Molecule mol; (mol = importer.read()) != null; ) { mol.hydrogenize(false); mol.aromatize(); mol.calcHybridization(); AtomPairDescriptor ap = new AtomPairDescriptor (mol); features.add(ap.features()); /* System.out.println("==== " + mol.getName() + " ===="); for (Map.Entry me : ap.features().entrySet()) { System.out.println(me.getKey() + " " + me.getValue()); } */ } importer.close(); } int size = features.size(); System.out.println(size); for (int i = 0; i < size; ++i) { Map fi = features.get(i); if (i+1 < size) { Map fj = features.get(i+1); System.out.print(cityBlockDistance (fi, fj)); for (int j = i + 2; j < features.size(); ++j) { fj = features.get(j); System.out.print("," + cityBlockDistance (fi, fj)); } System.out.println(); } } } }