Commit 76811864 authored by jcolosi's avatar jcolosi
Browse files

Updates to the UniqueUnion routine.

UniqueUnion now finds the 'source' files in the root, reads as UTF-8,
converts from U-label to A-label, and then generates the merged file.
All data is deposited in the 'data' directory and labeled for the day of
creation.
parent 2bf1b41c
No preview for this file type
......@@ -2,10 +2,11 @@ package com.verisign.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
......@@ -13,67 +14,154 @@ import java.util.Date;
import java.util.Set;
import java.util.TreeSet;
import com.vgrs.xcode.common.Unicode;
import com.vgrs.xcode.idna.Idna;
import com.vgrs.xcode.idna.Punycode;
import com.vgrs.xcode.util.XcodeException;
/**
* Support merge many files into a unique list.
* Process various text files, converting to A-labels, and creating a single
* union file with all unique records.
* <p>
* @author jcolosi
* @version 1.0 Jan 8, 2014
* @version 2.0 Apr 16, 2014
*/
public class UniqueUnion {
static class Config {
/**
* A file filter to operate on .txt files only.
* Date formats
*/
static private final FilenameFilter txtFilesOnly = new FilenameFilter() {
public boolean accept(File directory, String fileName) {
return fileName.endsWith(".txt");
static final SimpleDateFormat fileFormat = new SimpleDateFormat("yyyy-MM-dd");
/**
* Dir/Name.Label.Ext
*/
static final String actionExt = "txt";
static final String outDir = "data/" + fileFormat.format(new Date());
static final String outLabel = "ALabel";
static final String outExt = "txt";
static final String mergeName = "ReservedNames";
static final String mergeLabel = fileFormat.format(new Date());
static final String mergeExt = "out";
static final String binDir = "bin";
static final String TwoCharacterLabels = "S5.2.txt";
static final String readmeFile = "# Readme.log";
static final String readmeContent = ""
+ "# Warning: Do Not Edit!\n"
+ "# The files in this directory are generated from the 'source' files\n"
+ "# at the root of the repository. Each source file has been converted\n"
+ "# to A-Labels. All source files have been merged into the\n"
+ "# ReservedNames.<date>.out file containing all unique Reserved Names.\n";
}
};
/**
* Constants in support of the output file name.
* A file filter to operate on files with a certain extension.
*/
static private final String dateFormat = "yyyy-MM-DD_kk-mm-ss_S";
static private final SimpleDateFormat fileFormat = new SimpleDateFormat(dateFormat);
static private final String outFilename = "UniqueUnion.%s.out";
static private final FilenameFilter filter = new FilenameFilter() {
    public boolean accept(File directory, String filename) {
        // Reject on the name first; only matching names need a filesystem check.
        if (!filename.endsWith("." + Config.actionExt)) {
            return false;
        }
        // Directories whose names happen to carry the extension are excluded.
        return new File(directory, filename).isFile();
    }
};
/**
 * Shared IDNA converter, built once when the class is loaded and backed by
 * a Punycode codec. It is used to convert each record to its ASCII form.
 */
static private Idna idna;
static {
try {
idna = new Idna(new Punycode());
} catch (XcodeException e) {
// Rethrow unchecked: a static initializer cannot throw a checked
// exception, and a throw from here aborts class initialization.
throw new RuntimeException(e);
}
}
/**
 * Program entry point; delegates all work to execute().
 * @param args Command-line arguments; not read by this method.
 * @throws IOException Declared in the signature for callers.
 */
static public void main(String[] args) throws IOException {
UniqueUnion.execute();
}
/**
* Merge .txt files in the current directory into a unique list.
* Process files in the current directory and merge into a unique list.
*/
static private void execute() {
Set<String> uniqueList = new TreeSet<String>();
Date executeTime = new Date();
String datetime = fileFormat.format(executeTime);
StringBuilder fileList;
/**
* Collect .txt files in the current directory
* Collect files in the current directory
*/
File root = new File(System.getProperty("user.dir"));
File[] files = root.listFiles(txtFilesOnly);
if (root.getName().equals(Config.binDir)) {
root = root.getParentFile();
}
File[] files = root.listFiles(filter);
/**
* Establish an output directory
*/
File outDir = new File(root, Config.outDir);
outDir.mkdirs();
/**
* For each .txt file, normalize and add to the unique list
* For each file, normalize and add to the unique list
*/
for (File file : files) {
if (file.getName().equalsIgnoreCase(Config.TwoCharacterLabels)) continue;
fileList = new StringBuilder();
try {
for (String item : readFile(file)) {
item = normalize(item);
if (item != null) uniqueList.add(item);
for (String line : readFile(file)) {
String item = normalize(line);
if (item == null) continue; // Comment
item = getALabel(item);
if (item == null) continue; // Invalid
fileList.append(item + "\n");
uniqueList.add(item);
}
} catch (FileNotFoundException e) {
System.err.println("Cannot find file: " + file);
} catch (IOException e) {
System.err.println("Cannot read file: " + file);
}
/**
* Write section file with A-Labels
*/
String outfilename = removeExtension(file.getName());
outfilename += "." + Config.outLabel;
outfilename += "." + Config.outExt;
File outfile = new File(outDir, outfilename);
writeFile(outfile, fileList.toString());
}
/**
* Write unique list to a single output file
* Write unique list file
*/
File outfile = new File(String.format(outFilename, datetime));
String outfilename = Config.mergeName;
outfilename += "." + Config.mergeLabel;
outfilename += "." + Config.outExt;
File outfile = new File(outDir, outfilename);
writeFile(outfile, toString(uniqueList));
/**
* Write readme file
*/
File readmefile = new File(outDir, Config.readmeFile);
writeFile(readmefile, Config.readmeContent);
}
/**
 * Convert a record to its ASCII (A-label) form with the shared IDNA converter.
 * @param aString The String to be encoded.
 * @return The converted String, or null when the converter rejects the input.
 */
static private String getALabel(final String aString) {
    char[] input = aString.toCharArray();
    try {
        return new String(idna.domainToAscii(Unicode.encode(input)));
    } catch (XcodeException e) {
        // Name the offending record so the failure can be traced to its source line.
        System.err.println("Cannot convert to A-label: " + aString);
        e.printStackTrace();
        return null;
    }
}
......@@ -82,7 +170,7 @@ public class UniqueUnion {
* @param aString The String to be normalized.
* @return The normalized String.
*/
static private String normalize(String aString) {
static private String normalize(final String aString) {
String input = aString;
input = input.replaceFirst("\\s*#.*", "");
input = input.trim();
......@@ -90,37 +178,6 @@ public class UniqueUnion {
return input.length() > 0 ? input : null;
}
/**
* Lowercase only the Basic Latin alphabet to preserve utf-8.
* @param aString The String to be lowercased.
* @return The lowercase String.
*/
static private String toLowerCaseAscii(String aString) {
StringBuilder out = new StringBuilder();
for (char c : aString.toCharArray()) {
if (c >= 'A' && c <= 'Z') c = Character.toLowerCase(c);
out.append(c);
}
return out.toString();
}
/**
* Write the contents of a Set into an output file.
* @param outfile The file into which the set will be written.
* @param set The contents to write to the output file.
* @throws IOException If the output file cannot be opened.
*/
static private void writeFile(File outfile, Set<String> set) throws IOException {
PrintWriter writer = new PrintWriter(outfile);
StringBuilder out = new StringBuilder();
for (String item : set) {
out.append(item + "\n");
}
writer.print(out.toString());
writer.flush();
writer.close();
}
/**
* Read the contents of an input file and store each line as a String object
* in an ArrayList.
......@@ -134,7 +191,8 @@ public class UniqueUnion {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(infile));
reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile),
"UTF-8"));
while ((line = reader.readLine()) != null) {
list.add(line.trim());
}
......@@ -151,8 +209,51 @@ public class UniqueUnion {
return list;
}
static public void main(String[] args) throws IOException {
UniqueUnion.execute();
/**
 * Strip the final extension from a file name.
 * @param filename The file name to shorten.
 * @return Everything before the last '.', or the name unchanged when it
 *         contains no '.'.
 */
static private String removeExtension(String filename) {
    final int dot = filename.lastIndexOf('.');
    return dot < 0 ? filename : filename.substring(0, dot);
}
/**
 * Fold only the Basic Latin capitals A-Z to lowercase; every other
 * character, including non-ASCII letters, is left exactly as it was.
 * @param aString The String to be lowercased.
 * @return The lowercase String.
 */
static private String toLowerCaseAscii(String aString) {
    final char[] chars = aString.toCharArray();
    for (int i = 0; i < chars.length; i++) {
        final char current = chars[i];
        if ('A' <= current && current <= 'Z') {
            chars[i] = Character.toLowerCase(current);
        }
    }
    return new String(chars);
}
/**
 * Render a set as text, one element per line in the set's iteration order.
 * @param set The elements to render.
 * @return Each element followed by a newline; empty for an empty set.
 */
static private String toString(Set<String> set) {
    final StringBuilder joined = new StringBuilder();
    for (String entry : set) {
        joined.append(entry).append('\n');
    }
    return joined.toString();
}
/**
 * Write a String into an output file, replacing any existing content.
 * Failures are reported on standard error rather than thrown.
 * @param outfile The file into which the message will be written.
 * @param outmessage The contents to write to the output file.
 */
static private void writeFile(File outfile, String outmessage) {
    PrintWriter writer;
    try {
        writer = new PrintWriter(outfile);
    } catch (FileNotFoundException e) {
        System.err.println("Cannot write file: " + outfile);
        e.printStackTrace();
        return;
    }
    try {
        writer.print(outmessage);
        // PrintWriter never throws on I/O failure; checkError() flushes
        // the stream and reports whether any write went wrong.
        if (writer.checkError()) {
            System.err.println("Cannot write file: " + outfile);
        }
    } finally {
        writer.close();
    }
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment