Commit 76811864 authored by jcolosi's avatar jcolosi
Browse files

Updates to the UniqueUnion routine.

UniqueUnion now finds the 'source' files in the root, reads as UTF-8,
converts from U-label to A-label, and then generates the merged file.
All data is deposited in the 'data' directory and labeled for the day of
creation.
parent 2bf1b41c
No preview for this file type
...@@ -2,10 +2,11 @@ package com.verisign.util; ...@@ -2,10 +2,11 @@ package com.verisign.util;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter; import java.io.FilenameFilter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -13,67 +14,154 @@ import java.util.Date; ...@@ -13,67 +14,154 @@ import java.util.Date;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import com.vgrs.xcode.common.Unicode;
import com.vgrs.xcode.idna.Idna;
import com.vgrs.xcode.idna.Punycode;
import com.vgrs.xcode.util.XcodeException;
/** /**
* Support merge many files into a unique list. * Process various text files, converting to A-labels, and creating a single
* union file with all unique records.
* <p>
* @author jcolosi * @author jcolosi
* @version 1.0 Jan 8, 2014 * @version 1.0 Jan 8, 2014
* @version 2.0 Apr 16, 2014
*/ */
public class UniqueUnion { public class UniqueUnion {
static class Config {
    /**
     * Date format (yyyy-MM-dd) used to label the output directory and the
     * merged file.
     */
    static final SimpleDateFormat fileFormat = new SimpleDateFormat("yyyy-MM-dd");

    /**
     * The creation day, formatted exactly once. Both outDir and mergeLabel
     * derive from this value so the directory and the merged file always
     * carry the same date, even when the class is initialized around
     * midnight.
     */
    static final String today = fileFormat.format(new Date());

    /**
     * Output files are named Dir/Name.Label.Ext
     */
    // Extension a file must have to be picked up as input.
    static final String actionExt = "txt";

    // Directory that receives every generated file.
    static final String outDir = "data/" + today;

    // Label and extension appended to each per-source output file.
    static final String outLabel = "ALabel";
    static final String outExt = "txt";

    // Name, label and extension of the merged file.
    static final String mergeName = "ReservedNames";
    static final String mergeLabel = today;
    // NOTE(review): the visible execute() code appends outExt, not mergeExt,
    // to the merged file name, while readmeContent advertises ".out" --
    // confirm which extension is intended.
    static final String mergeExt = "out";

    // When the working directory has this name, its parent is used as root.
    static final String binDir = "bin";

    // Input file that is skipped when collecting source files.
    static final String TwoCharacterLabels = "S5.2.txt";

    // Name and content of the readme written next to the generated files.
    static final String readmeFile = "# Readme.log";
    static final String readmeContent = ""
            + "# Warning: Do Not Edit!\n"
            + "# The files in this directory are generated from the 'source' files\n"
            + "# at the root of the repository. Each source file has been converted\n"
            + "# to A-Labels. All source files have been merged into the\n"
            + "# ReservedNames.<date>.out file containing all unique Reserved Names.\n";
}
/** /**
* A file filter to operate on .txt files only. * A file filter to operate on files with a certain extension.
*/ */
static private final FilenameFilter txtFilesOnly = new FilenameFilter() { static private final FilenameFilter filter = new FilenameFilter() {
public boolean accept(File directory, String fileName) { public boolean accept(File directory, String filename) {
return fileName.endsWith(".txt"); File file = new File(directory, filename);
return file.isFile() && filename.endsWith("." + Config.actionExt);
} }
}; };
/** static private Idna idna;
* Constants in support of the output file name.
*/ static {
static private final String dateFormat = "yyyy-MM-DD_kk-mm-ss_S"; try {
static private final SimpleDateFormat fileFormat = new SimpleDateFormat(dateFormat); idna = new Idna(new Punycode());
static private final String outFilename = "UniqueUnion.%s.out"; } catch (XcodeException e) {
throw new RuntimeException(e);
}
}
/**
 * Program entry point; runs the merge once.
 * @param args Command-line arguments; not read.
 * @throws IOException Declared on the signature for callers.
 */
static public void main(String[] args) throws IOException {
    execute();
}
/** /**
* Merge .txt files in the current directory into a unique list. * Process files in the current directory and merge into a unique list.
*/ */
static private void execute() { static private void execute() {
Set<String> uniqueList = new TreeSet<String>(); Set<String> uniqueList = new TreeSet<String>();
Date executeTime = new Date(); StringBuilder fileList;
String datetime = fileFormat.format(executeTime);
/** /**
* Collect .txt files in the current directory * Collect files in the current directory
*/ */
File root = new File(System.getProperty("user.dir")); File root = new File(System.getProperty("user.dir"));
File[] files = root.listFiles(txtFilesOnly); if (root.getName().equals(Config.binDir)) {
root = root.getParentFile();
}
File[] files = root.listFiles(filter);
/**
* Establish an output directory
*/
File outDir = new File(root, Config.outDir);
outDir.mkdirs();
/** /**
* For each .txt file, normalize and add to the unique list * For each file, normalize and add to the unique list
*/ */
for (File file : files) { for (File file : files) {
if (file.getName().equalsIgnoreCase(Config.TwoCharacterLabels)) continue;
fileList = new StringBuilder();
try { try {
for (String item : readFile(file)) { for (String line : readFile(file)) {
item = normalize(item); String item = normalize(line);
if (item != null) uniqueList.add(item); if (item == null) continue; // Comment
item = getALabel(item);
if (item == null) continue; // Invalid
fileList.append(item + "\n");
uniqueList.add(item);
} }
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
System.err.println("Cannot find file: " + file); System.err.println("Cannot find file: " + file);
} catch (IOException e) { } catch (IOException e) {
System.err.println("Cannot read file: " + file); System.err.println("Cannot read file: " + file);
} }
/**
* Write section file with A-Labels
*/
String outfilename = removeExtension(file.getName());
outfilename += "." + Config.outLabel;
outfilename += "." + Config.outExt;
File outfile = new File(outDir, outfilename);
writeFile(outfile, fileList.toString());
} }
/** /**
* Write unique list to a single output file * Write unique list file
*/ */
File outfile = new File(String.format(outFilename, datetime)); String outfilename = Config.mergeName;
outfilename += "." + Config.mergeLabel;
outfilename += "." + Config.outExt;
File outfile = new File(outDir, outfilename);
writeFile(outfile, toString(uniqueList));
/**
* Write readme file
*/
File readmefile = new File(outDir, Config.readmeFile);
writeFile(readmefile, Config.readmeContent);
}
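/**
 * Convert a string to its ASCII (A-label) form using IDNA domainToAscii.
 * @param aString The String to be encoded.
 * @return The encoded String, or null if the conversion fails with an
 * XcodeException.
 */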
static private String getALabel(final String aString) {
char[] input = aString.toCharArray();
try { try {
writeFile(outfile, uniqueList); return new String(idna.domainToAscii(Unicode.encode(input)));
} catch (IOException e) { } catch (XcodeException e) {
System.err.println("Cannot write file: " + outfile); e.printStackTrace();
return null;
} }
} }
...@@ -82,7 +170,7 @@ public class UniqueUnion { ...@@ -82,7 +170,7 @@ public class UniqueUnion {
* @param aString The String to be normalized. * @param aString The String to be normalized.
* @return The normalized String. * @return The normalized String.
*/ */
static private String normalize(String aString) { static private String normalize(final String aString) {
String input = aString; String input = aString;
input = input.replaceFirst("\\s*#.*", ""); input = input.replaceFirst("\\s*#.*", "");
input = input.trim(); input = input.trim();
...@@ -90,37 +178,6 @@ public class UniqueUnion { ...@@ -90,37 +178,6 @@ public class UniqueUnion {
return input.length() > 0 ? input : null; return input.length() > 0 ? input : null;
} }
/**
 * Lowercase the Basic Latin letters A-Z only; every other character,
 * including non-ASCII letters, is copied through unchanged.
 * @param aString The String to be lowercased.
 * @return The String with A-Z replaced by a-z.
 */
static private String toLowerCaseAscii(String aString) {
    char[] chars = aString.toCharArray();
    for (int i = 0; i < chars.length; i++) {
        char current = chars[i];
        if (current >= 'A' && current <= 'Z') {
            // A-Z and a-z are a fixed distance apart in Basic Latin.
            chars[i] = (char) (current + ('a' - 'A'));
        }
    }
    return new String(chars);
}
/**
 * Write the contents of a Set into an output file, one item per line.
 * @param outfile The file into which the set will be written.
 * @param set The contents to write to the output file.
 * @throws IOException If the output file cannot be opened or a write to it
 * fails.
 */
static private void writeFile(File outfile, Set<String> set) throws IOException {
    PrintWriter writer = new PrintWriter(outfile);
    try {
        for (String item : set) {
            writer.print(item);
            writer.print("\n");
        }
        // PrintWriter never throws on a failed write; checkError() flushes
        // the stream and reports whether any write failed.
        if (writer.checkError()) {
            throw new IOException("Cannot write file: " + outfile);
        }
    } finally {
        // Release the file handle even when an exception is raised above.
        writer.close();
    }
}
/** /**
* Read the contents of an input file and store each line as a String object * Read the contents of an input file and store each line as a String object
* in an ArrayList. * in an ArrayList.
...@@ -134,7 +191,8 @@ public class UniqueUnion { ...@@ -134,7 +191,8 @@ public class UniqueUnion {
BufferedReader reader = null; BufferedReader reader = null;
try { try {
reader = new BufferedReader(new FileReader(infile)); reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile),
"UTF-8"));
while ((line = reader.readLine()) != null) { while ((line = reader.readLine()) != null) {
list.add(line.trim()); list.add(line.trim());
} }
...@@ -151,8 +209,51 @@ public class UniqueUnion { ...@@ -151,8 +209,51 @@ public class UniqueUnion {
return list; return list;
} }
static public void main(String[] args) throws IOException { static private String removeExtension(String filename) {
UniqueUnion.execute(); int index = filename.lastIndexOf(".");
if (index == -1) return filename;
return filename.substring(0, index);
}
/**
 * Lowercase the letters A-Z of the Basic Latin alphabet and leave every
 * other character untouched, so non-ASCII text is preserved as read.
 * @param aString The String to be lowercased.
 * @return The String with A-Z replaced by a-z.
 */
static private String toLowerCaseAscii(String aString) {
    int length = aString.length();
    StringBuilder lowered = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        char ch = aString.charAt(i);
        boolean isUpperAscii = ch >= 'A' && ch <= 'Z';
        lowered.append(isUpperAscii ? (char) (ch + ('a' - 'A')) : ch);
    }
    return lowered.toString();
}
/**
 * Render a Set as text, one element per line.
 * @param set The elements to render, taken in the set's iteration order.
 * @return Every element followed by a newline, concatenated.
 */
static private String toString(Set<String> set) {
    StringBuilder joined = new StringBuilder();
    for (String element : set) {
        joined.append(element);
        joined.append('\n');
    }
    return joined.toString();
}
/**
 * Write a String into an output file. If the file cannot be opened, an
 * error message and stack trace are printed to standard error and nothing
 * is written.
 * @param outfile The file into which the message will be written.
 * @param outmessage The contents to write to the output file.
 */
static private void writeFile(File outfile, String outmessage) {
PrintWriter writer;
try {
writer = new PrintWriter(outfile);
writer.print(outmessage);
writer.flush();
writer.close();
} catch (FileNotFoundException e) {
System.err.println("Cannot write file: " + outfile);
e.printStackTrace();
}
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment