001    package jigcell.sbml2.math;
002    
003    import java.io.PrintWriter;
004    import java.util.ArrayList;
005    import java.util.HashMap;
006    import java.util.Hashtable;
007    import java.util.Iterator;
008    import java.util.Map;
009    
010    /**
011     * SymbolTable keeps track of all names and strings in a model and their
012     * translated names in Fortran.  The names are translated by making the strings
013     * Fortran safe.  See makeFortranIdSafe.  The symbol table can make strings safe
014     * as Fortran identifiers or safe as Fortran character strings.
015     * <p>
016     * This code is licensed under the DARPA BioCOMP Open Source License.  See
017     * LICENSE for more details.
018     *
019     * @author Jason Zwolak
020     * @author Nicholas Allen
021     */
022    
023    public class SymbolTable {
024    
025        /*
026         * The following constants deal with mutating strings when there already
027         * exists a string of the same name.  See the routine findAFortranName for
028         * more documentation.
029         */
030        public static final int ID_START = 1;
031        public static final int ID_END = 9999;
032        public static final int ID_WIDTH = 4;
033    
034        /**
035         * These constants describe the type of symbol table.  A FORTRAN_IDENTIFIER
036         * symbol table will create symbols suitable as Fortran identifiers.  A
037         * FORTRAN_CHARACTER_STRING symbol table will create symbols suitable as
038         * Fortran character strings.  IDENTITY will not modify the name at all.  It
039         * will simply copy it to "fortranName".  It will not modify the name even
040         * if it is identical to another fortranName.
041         */
042        public static final class Mode {
043            public static final Mode FORTRAN_IDENTIFIER       =
044                new Mode( "FORTRAN_INDENTIFIER"     , 31          )
045            ;
046            /**
047             * The mode for Fortran character strings.  This mode limits characters
048             * to the length of 65000.  This is an empirically determined number for
049             * the the Lahey-Fujitsu Fortran 95 compiler v6.1 Express for Linux.
050             */
051            public static final Mode FORTRAN_CHARACTER_STRING =
052                new Mode( "FORTRAN_CHARACTER_STRING", 65000 )
053            ;
054            public static final Mode IDENTITY =
055                new Mode( "IDENTITY", Integer.MAX_VALUE )
056                //new Mode( "IDENTITY", (1<<31) - 1 )
057            ;
058            private String name;
059            private int maxLength;
060            public Mode( String name, int maxLength) {
061                this.name      = name;
062                this.maxLength = maxLength;
063            }
064            public int maxLength() { return maxLength; }
065            public String toString() { return name; }
066        }
067        /*
068         * forbiddenSymbols ares symbols already defined in the Fortran Code and
069         * cannot be used by the modeller.
070         *
071         * Note this isn't used yet, it is just a place holder and reminder.
072         */
073        private static Hashtable forbiddenSymbols;
074    
075        /*
076         * nameCol - list of names in the symbol table.
077         * fortranNameCol - list of fortran names in the symbol table.
078         * idCol - list of ids in the symbol table
079         * nameColRev - list of names in the symbol table.
080         * fortranNameColRev - list of fortran names in the symbol table.
081         *
082         * nameCol, fortranNameCol, and idCol form the symbol table with nameCol
083         * containing the names that are keys.
084         *
085         * nameColRev and fortranNameColRev provide reverse lookup of names based on
086         * fortran names.
087         *
088         * nameCol and fortranNameColRev are alphabetically sorted.  fortranNameCol,
089         * idCol, and nameColRev are ordered so they match up with their
090         * counterpart names in nameCol and fortranNameColRev.
091         *
092         * fortranNameColRev has all its entries capitolized.  Fortran is case
093         * insensitive.  Therefore, the names must be unique regardless of case.
094         * Capitolizing all entries in fortranNameColRev ensures this.
095         *
096         * fortranNameCol contains the names that will be written to the Fortran
097         * file.
098         *
099         * idNameMap contains a hash of id-name pairs.  This is for quickly looking
100         * up names given an id.  Note that the ids are assumed to be unique (an
101         * exception is thrown otherwise, see code below to ensure an exception is
102         * really thrown).
103         */
104        private ArrayList nameCol;
105        private ArrayList fortranNameCol;
106        private ArrayList idCol;
107        private ArrayList nameColRev;
108        private ArrayList fortranNameColRev;
109        private Hashtable idNameMap;
110    
111        /*
112         * maxLength specifies the maximum length of symbols in this table.  After
113         * a symbol is mangled it is guaranteed to be this length or smaller.
114         *
115         * mode specifies the mode of this symbol table.  The mod can be
116         * TYPE_IDENTIFIER or TYPE_CHARACTER_STRING.  See TYPE_IDENTIFIER and
117         * TYPE_CHARACTER_STRING for more info.
118         *
119         * If padToLength is true then FORTRAN_CHARACTER_STRINGs will be padded with
120         * the padCharacter to the maxLength.
121         */
122        private int maxLength;
123        private Mode mode = Mode.FORTRAN_IDENTIFIER;
124        private boolean padToLength = false;
125        private char padCharacter = ' ';
126    
127        /**
128         * Create a new empty symbol table.
129         */
130        public SymbolTable() {
131            maxLength         = mode.maxLength();
132            nameCol           = new ArrayList();
133            fortranNameCol    = new ArrayList();
134            idCol             = new ArrayList();
135            nameColRev        = new ArrayList();
136            fortranNameColRev = new ArrayList();
137            idNameMap         = new Hashtable();
138        }
139    
140        /**
141         * Create a new SymbolTable object with specified maximum ID length.  The
142         * user may wish to limit the maximum ID length to smaller than Fortran
143         * 95's limit on the length.  This is useful if the user wishes to prepend
144         * or append strings with the '__' (double underscore) separator for a new
145         * unique ID based on an old ID.  This is also useful if Fortran is not the
146         * target for code generation.  The maxLength cannot be greater than the
147         * default for the given mode.
148         */
149        public SymbolTable( int maxLength ) {
150            this();
151            this.maxLength =
152                maxLength > mode.maxLength() ?
153                mode.maxLength() : maxLength
154            ;
155        }
156    
157        /**
158         * Creates a new empty SymbolTable object with the specified mode.  The mode
159         * will control the nature in which the symbol table mangles symbols.  See
160         * the {@link Mode} documentation for more information.
161         */
162        public SymbolTable( Mode mode ) {
163            this();
164            this.mode = mode;
165            this.maxLength = mode.maxLength();
166        }
167    
168        /**
169         * Creates a new empty SymbolTable with the specified mode and maximum id
170         * length.
171         */
172        public SymbolTable( Mode mode, int maxLength ) {
173            this();
174            this.mode = mode;
175            this.maxLength = maxLength;
176        }
177    
178        /**
179         * Creates a new empty SymbolTable with the specified mode and maximum id
180         * length and request to pad.
181         */
182        public SymbolTable( Mode mode, int maxLength, boolean padToLength ) {
183            this();
184            this.mode = mode;
185            this.maxLength = maxLength;
186            this.padToLength = padToLength;
187        }
188    
189        /**
190         * Write the symbol table to out.
191         */
192        public void dumpSymbolTable( PrintWriter out ) {
193            Iterator n = nameCol.iterator();
194            Iterator f = fortranNameCol.iterator();
195            while ( n.hasNext() && f.hasNext() ) {
196                String name = (String)n.next();
197                out.println(name+" ("+lookupId(name)+")"+" => "+f.next());
198            }
199            out.flush();
200        }
201        public void dumpSymbolTable( ) {
202            dumpSymbolTable(new PrintWriter(System.out));
203        }
204    
205        public void emptySymbolTable(){
206            nameCol.clear();
207            fortranNameCol.clear();
208            idCol.clear();
209            nameColRev.clear();
210            fortranNameColRev.clear();
211            idNameMap.clear();
212        }
213    
214        public Map getSymbolToIdMap() {
215            Map map = new HashMap();
216            for (int i=0; i<fortranNameCol.size(); i++) {
217                map.put(fortranNameCol.get(i),idCol.get(i));
218            }
219            return map;
220        }
221    
222        /**
223         * Find the fortranName for name.  If name doesn't exist in the current
224         * symbol table then return null otherwise return the fortran name.
225         *
226         * @return Fortran name on success, null on error.
227         */
228        public String lookupSymbol( String name ) {
229            int index = lookupSymbolIndex( name, nameCol );
230            if ( index < 0 ) return null;
231            return (String)fortranNameCol.get(index);
232        }
233        /**
234         * Returns the Fortran name for id.
235         */
236        public String lookupSymbolById( String id ) {
237            String name = (String)idNameMap.get(id);
238            if ( name == null )
239                return null;
240            String ret = lookupSymbol(name);
241            return ret;
242        }
243        /**
244         * Returns the id for the JigCell name.
245         */
246        public String lookupId( String name ) {
247            int index = lookupSymbolIndex( name, nameCol );
248            if ( index < 0 ) return null;
249            return (String)idCol.get(index);
250        }
251        /**
252         * Returns the JigCell name for id.
253         */
254        public String lookupName( String id ) {
255            return (String)idNameMap.get(id);
256        }
257        /**
258         * Returns the JigCell id for fortranName.
259         */
260        public String reverseLookupId( String fortranName ) {
261            int index = lookupFortranNameIndex( fortranName );
262            if ( index < 0 ) return null;
263            return lookupId( (String)nameColRev.get(index) );
264        }
265    
266    
267        /**
268         * Finds fname in fortranNameColRev.  Converts fname to upper case first
269         * because all entries in fortranNameColRev are upper case.  Calls
270         * lookupSymbolIndex to perform the search.
271         */
272        private int lookupFortranNameIndex( String fname ) {
273            if ( mode == Mode.IDENTITY )
274                return lookupSymbolIndex( fname, fortranNameColRev );
275            return lookupSymbolIndex( fname.toUpperCase(), fortranNameColRev );
276        }
277    
278        /**
279         * Finds the index of name in the internal symbol table.  If name doesn't
280         * exist in the table then the index returned is the negative of the place
281         * in the table where this symbol should be placed -1; the minus one makes
282         * the code work in the case where the symbol should be inserted at the 0th
283         * position.
284         *
285         * @return Index of name if name is in the symbol table, negative position
286         * in the table otherwise.
287         */
288        private int lookupSymbolIndex( String name, ArrayList array ) {
289            int left = 0, right = array.size();
290            int next = ( left + right ) / 2;
291            if ( right == left ) return -1;
292            int compare = 0;
293            if ( next < array.size() )
294                compare = name.compareTo( array.get(next) );
295            while ( right > left && compare != 0 ) {
296                if ( compare > 0 ) left  = next + 1;
297                else               right = next;
298                next = ( left + right ) / 2;
299                if ( next < array.size() )
300                    compare = name.compareTo( array.get(next) );
301            }
302            if ( compare == 0 ) return next;
303            return -next-1;
304        }
305    
306        /**
307         * Add <i>id</i> to the symbol table.  If name already exists then throw an
308         * exception.  This routine adds a symbol with the id <i>id</i> and the
309         * name <i>id</i>.  In other words, the id is copied to the name since the
310         * name isn't present and symbols are added by name.
311         *
312         * @param id The ID of the symbol table entry.
313         * @return The Fortran name of name.
314         */
315        public String addSymbol( String id ) throws Exception {
316            return addSymbol( id, id );
317        }
318        /**
319         * Add a symbol with name and id.  All ids entered here must be unique or
320         * an exception is thrown.
321         */
322        public String addSymbol( String name, String id ) throws Exception {
323            if ( name == null ) {
324                System.err.println(
325                    "WARNING: symbol name is null, setting name = id"
326                );
327                name = id;
328            }
329            if ( id   == null ) throw new Exception("id is null");
330            if ( nameCol.size() == Integer.MAX_VALUE )
331                throw new Exception( "Symbol table is full." );
332            String fortranName = makeFortranSafe( name, mode, maxLength );
333            int index        = lookupSymbolIndex( name, nameCol );
334            int fortranIndex = lookupFortranNameIndex( fortranName );
335            if ( index >= 0 )
336                throw new Exception( name+" already exists in the symbol table." );
337            if ( fortranIndex >= 0 && mode != Mode.IDENTITY )
338                fortranName = findAFortranName( fortranName );
339            int forIndex = lookupFortranNameIndex( fortranName );
340            nameCol.add(           -index-1,    name                      );
341            fortranNameCol.add(    -index-1,    fortranName               );
342            idCol.add(             -index-1,    id                        );
343            nameColRev.add(        -forIndex-1, name                      );
344            if ( mode == Mode.IDENTITY )
345                fortranNameColRev.add( -forIndex-1, fortranName );
346            else
347                fortranNameColRev.add( -forIndex-1, fortranName.toUpperCase() );
348            if ( idNameMap.get(id) != null )
349                throw new Exception("Id "+id+" isn't unique.");
350            idNameMap.put(    id, name );
351            return fortranName;
352        }
353    
354        /**
355         * Takes a valid Fortran identifier name or string and finds a similar
356         * identifier name
357         * that is not already used in fortranNameCol.  If name is already in
358         * fortranNameCol then findAFortranName mutates name in a recognizable way
359         * until it is unique among all the fortranNameCol rows or findAFortranName
360         * has exhausted its mutations.  If the mutations are exhausted then an
361         * exception is thrown.
362         */
363        public String findAFortranName( String name ) throws Exception {
364            int index = 0;
365            int id = ID_START;
366            String suffix = "";
367            String newName = "";
368            if ( lookupFortranNameIndex( name ) < 0 ) return name;
369            // Append or replace end with _id.
370            suffix = Integer.toString(id);
371            while ( suffix.length() < ID_WIDTH ) { suffix = "0" + suffix; }
372            suffix = "_" + suffix;
373            if ( name.length() < maxLength - suffix.length() ) {
374                newName = name + suffix;
375            } else {
376                newName =
377                    name.substring( 0, maxLength - suffix.length() )
378                    + suffix
379                ;
380            }
381            // If name still in fortranNameCol then increment id and replace old id
382            // in name with new.  Only repeat until id = ID_END.
383            while (
384                ( index = lookupFortranNameIndex( newName ) ) >= 0
385                && id < ID_END
386            ) {
387                id++;
388                suffix = Integer.toString(id);
389                while ( suffix.length() < ID_WIDTH ) { suffix = "0" + suffix; }
390                suffix = "_" + suffix;
391                if ( name.length() < maxLength - suffix.length() ) {
392                    newName = name + suffix;
393                } else {
394                    newName =
395                        name.substring( 0, maxLength - suffix.length() )
396                        + suffix
397                    ;
398                }
399            }
400            if ( index >= 0 )
401                throw new Exception( "A unique Fortran name couldn't be found." );
402            return newName;
403        }
404    
405        /**
406         * Returns a Fortran safe string according to <i>mode</i>
407         */
408        public String makeFortranSafe( String string )
409            throws Exception
410        {
411            return makeFortranSafe(
412                string, mode, maxLength, padToLength, padCharacter
413            );
414        }
415    
416        /**
417         * Makes string Fortran safe according to <i>mode</i>.
418         */
419        public String makeFortranSafe( String string, Mode mode )
420            throws Exception
421        {
422            return makeFortranSafe(
423                string, mode, maxLength, padToLength, padCharacter
424            );
425        }
426    
427        public String makeFortranSafe(
428            String string, Mode mode, int maxLength
429        ) throws Exception {
430            return makeFortranSafe(
431                string, mode, maxLength, padToLength, padCharacter
432            );
433        }
434    
435        /**
436         * Makes string Fortran safe according to <i>mode</i> and <i>maxLength</i>.
437         */
438        public static String makeFortranSafe(
439            String string, Mode mode, int maxLength, boolean padToLength,
440            char padCharacter
441        ) throws Exception {
442            if        ( mode == Mode.IDENTITY ) {
443                return string;
444            } else if ( mode == Mode.FORTRAN_IDENTIFIER       ) {
445                return makeFortranIdSafe_( string, maxLength );
446            } else if ( mode == Mode.FORTRAN_CHARACTER_STRING ) {
447                return makeFortranCharacterSafe_(
448                    string, maxLength, padToLength, padCharacter )
449                ;
450            } else throw new Exception("Unrecognized mode.");
451        }
452    
453        /**
454         * Makes a string safe as a Fortran character string.  Uses the same
455         * algorithm as {@link #makeFortranIdSafe_(String,int)} with the following
456         * differences:
457         * <ul>
458         * <li>No string is ever prepended.
459         * <li>The characters [0-9A-Za-z~`!#$%&amp;*()_+-=[]\{};':",./&lt;&gt;? ]
460         * are permitted anywhere in the string.
461         * <li>The characters [@^|] are converted to _u??, u??_, or _u??_ depending
462         * on whether the character is at the end, beginning, or middle of the
463         * string, respectively.
464         * <li>All other characters are converted to _u??, u??_, or _u??_ depending
465         * on whether the character is at the end, beginning, or middle of the
466         * string, respectively.
467         * </ul>
468         */
469        public static String makeFortranCharacterSafe_(
470            String string, int maxLength, boolean padToLength, char padCharacter
471        ) throws Exception {
472            StringBuffer out = new StringBuffer();
473            StringBuffer in  = new StringBuffer(string);
474            while ( in.length() > 0 && out.length() < maxLength ) {
475                if ( in.toString().matches(
476                    "^[0-9A-Za-z~`!#$%&*()_+\\-=\\[\\]\\\\{};':\",./<>? ].*")
477                ) {
478                    out.append(in.charAt(0));
479                } else {
480                    if ( out.length() > 1 ) out.append("_");
481                    out.append("u"+Integer.toHexString((int)in.charAt(0)));
482                    if ( in.length() > 1 ) out.append("_");
483                }
484                in.deleteCharAt(0);
485            }
486            if ( out.length() > maxLength ) {
487                return out.substring(0,maxLength);
488            }
489            if ( padToLength )
490                while ( out.length() < maxLength )
491                    out.append(padCharacter);
492            return out.toString();
493        }
494    
495        /**
496         * Convert a string to a Fortran safe identifier.
497         * <p><b>Documentation out of date.</b><p>
498         * Fortran 95 only accepts
499         * [A-Za-z0-9_] in identifier names.  Addtionally an identifier must begin
500         * with a letter and be less than 31 characters.  Identifiers can be
501         * Fortran keywords.
502         * <p>
503         * The algorithm works as follows:
504         <pre>
505         If the first character is not a letter prepend PREFIX.
506         Convert series of ['"]* to _ppp_ if in the middle of the string and _ppp if
507             at the end, where the number of ps is the number of
508             apostrophes (each ' counts as one p and each " counts as 2).
509         Convert all other non alphanumeric characters to a number as they are
510             defined in Java.  Surround the number with "_" if in the middle of the
511             string and prepend with "_" if at the end.  Underscores are also
512             converted to numbers.  This allows symbols from the symbol table to
513             take suffixes of the form "__MYUSERSUFFIX" and still guarantee
514             uniqueness.  The user must take care not to use suffixes that might be
515             generated by this routine (i.e. _p, and _u34).  Also, the user must
516             take care not to exceed the Fortran limit on variable names.  This
517             routine allows the user to restrict var name length to ensure suffixes
518             can be appended.
519         </pre>
520         */
521        public String makeFortranIdSafe( String string ) throws Exception {
522            return makeFortranIdSafe_( string, maxLength );
523        }
524    
525       private static String makeFortranIdSafe_ (String in, int maxLength) {
526          int length = in.length ();
527          if (length > maxLength)
528             length = maxLength;
529          if (length == 0)
530             return "";
531          StringBuffer out = new StringBuffer ();
532          char c = in.charAt (0);
533          int count = 0;
534          int pos = 1;
535    
536          // Avoid identifiers starting with non-alphabetic characters
537          if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
538             out.append ("A_");
539             count += 2;
540          }
541    
542    outer:
543          while (count < maxLength) {
544             if (c == '_' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') {
545                out.append (c);
546                count++;
547             } else if (c == '\'' || c == '"') {
548                out.append ("_");
549                count++;
550                while (true) {
551                   if (c == '\'') {
552                      out.append ("p");
553                      count++;
554                   } else if (c == '"') {
555                      out.append ("pp");
556                      count += 2;
557                   } else
558                      continue outer;
559                   if (pos == length)
560                      break outer;
561                   c = in.charAt (pos++);
562                }
563             } else {
564                out.append ("_u");
565                String hexCode = Integer.toHexString ((int) c);
566                out.append (hexCode);
567                count += 2 + hexCode.length ();
568             }
569             if (pos == length)
570                break;
571             c = in.charAt (pos++);
572          }
573          return count > maxLength ? out.substring (0, maxLength) : out.toString ();
574       }
575    }