Add additional output functionality to existing Node script

The following script currently MERGES multiple .json files into a single .json file, and SPLITS a single .json file into multiple .json files.

For SPLITTING, the file must contain one record per line, in the following newline-delimited (\n) format:


{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName2","key2":"path/to/folder/2/","key3":"value3","key4":"value4","key5":"value5"}

It then outputs individual .json files in the same format as above.

Task #1:

I need the script to prompt for an output choice: either the above format (option 1), or the following format (option 2). Option 2 requires the user to supply a name (e.g. userAddedKeyword, as seen below):

{"userAddedKeyword": [

{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName1","key2":"path/to/folder/2/","key3":"value3","key4":"value4","key5":"value5"}

]}
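
For illustration, here is a minimal sketch of how option 2 could be produced. The prompt wiring and the names wrapRecords, keyword, and records are assumptions for the sketch, not part of the current script:

var readline = require("readline");

// Sketch: wrap one group of records under a user-supplied key (option 2).
// `records` is the array of objects destined for a single output file;
// `keyword` is whatever the user typed at the prompt. Both are illustrative.
function wrapRecords(keyword, records) {
  // produces {"userAddedKeyword": [ {...}, {...} ]}
  return JSON.stringify({ [keyword]: records }, null, 1);
}

var rl = readline.createInterface({ input: process.stdin, output: process.stdout });
rl.question("Keyword for option 2 (e.g. userAddedKeyword): ", function (keyword) {
  rl.close();
  // for each group: fs.writeFileSync(outFile, wrapRecords(keyword, group));
});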

Task #2:

Currently, the script uses a key value taken from the JSON records themselves within the .json file. The following is an example of running the script using the key 'key1':
node script.js split "splitMe.json" key1 "output_folder/"

This takes all records with the same 'key1' value and outputs them to their own single .json file. So, if the splitMe.json file contains 10 records with the same 'key1' value, those 10 records will go into the same single output .json file. The same is true for all other records in the splitMe.json file. At the moment, it includes that 'key1' value in the output.

I need an option to include or omit that 'key1' value from the output. The 'key1' value is used to name the output files; sometimes that value is needed in the final output, but other times it serves only to name the file.
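
A minimal sketch of what the omit option amounts to, assuming a boolean omitKey flag (name illustrative) parsed from an extra command-line argument:

var fs = require("fs");

// Sketch: write one record, optionally stripping the naming key first.
function writeRecord(json, key, outDir, omitKey) {
  var fileName = json[key];            // e.g. key = "key1", used for the file name
  if (omitKey) {
    delete json[key];                  // drop "key1" from the written record
  }
  fs.appendFileSync(outDir + "/" + fileName + ".json", JSON.stringify(json) + "\n");
}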

Task #3:

At the moment, the script outputs all split files into the same designated folder, e.g. "output_folder/". I need an option to output files to unique folders by using a second 'key2' value from the splitMe.json file, similar to how 'key1' is used to name the file: 'key2' would declare the output directory/path. Likewise, 'key2' values would need to be omitted from the actual .json file output. So, if splitMe.json looked like this:


{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName1","key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName2","key2":"path/to/folder/2/","key3":"value3","key4":"value4","key5":"value5"}

then the records in the output .json files would look like this:


{"key1":"fileName1","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName1","key3":"value3","key4":"value4","key5":"value5"}


{"key1":"fileName2","key3":"value3","key4":"value4","key5":"value5"}

And if the user opted to omit the 'key1' value (used to name the .json files), then the records in the output .json files would look like this:


{"key3":"value3","key4":"value4","key5":"value5"}


{"key3":"value3","key4":"value4","key5":"value5"}


{"key3":"value3","key4":"value4","key5":"value5"}

And the output .json files would be written here:

path/to/folder/1/fileName1.json <-- this .json file would contain 2 records
path/to/folder/2/fileName2.json <-- this .json file would contain 1 record
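
A rough sketch of the routing logic Task #3 asks for, assuming nameKey/pathKey stand in for 'key1'/'key2' and omitName is the Task #2 option (all names illustrative; mkdirSync's recursive option needs Node 10+, older Node needs a manual loop like the mkDir helper in the second solution below):

var fs = require("fs");
var path = require("path");

// Sketch: name the file by json[nameKey] ('key1'), route it to json[pathKey]
// ('key2'), and strip the routing keys from the written record.
function routeRecord(json, nameKey, pathKey, omitName) {
  var dir = json[pathKey];                 // e.g. "path/to/folder/1/"
  var file = json[nameKey] + ".json";      // e.g. "fileName1.json"
  delete json[pathKey];                    // key2 is never written out
  if (omitName) {
    delete json[nameKey];                  // optionally drop key1 too
  }
  fs.mkdirSync(dir, { recursive: true });
  fs.appendFileSync(path.join(dir, file), JSON.stringify(json) + "\n");
}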

Below is a quick video I made showing how the script currently takes a splitMe.json file and outputs all of the records to their own uniquely named ('key1') .json files into a designated output_folder/:

http://somup.com/cbnoiy8Yo

Here is the script:

var fs = require("fs");
var action = process.argv[2];

if (action == "split") {
  var filePath = process.argv[3];
  var key = process.argv[4];
  // Asynchronous read
  fs.readFile(filePath, function (err, data) {
    if (err) {
      return console.error(err);
    }
    var lines = data.toString().split("\n");
    // determine the input type
    var type = "ndjson";
    // Note: The comma at the end of the line is optional. I assume the format
    // is [{object}],\n[{object}],\n[{object}]EOF
    if (lines[0].match(/^\[.*\],?$/)) {
      // it's the JSON-style format [<json>],
      type = "json";
    }
    for (var i = 0; i < lines.length; i++) {
      if (lines[i].trim() == "") {
        continue;
      }
      var json;
      if (type == "ndjson") {
        json = JSON.parse(lines[i]);
      }
      else if (type == "json") {
        // strip the surrounding brackets and optional trailing comma
        json = JSON.parse(lines[i].match(/^\[(.*)\],?$/)[1]);
      }
      fs.appendFile(
        process.argv[5] + "/" + json[key] + ".json",
        JSON.stringify(json) + "\n",
        function () {} // suppresses warning
      );
    }
  });
}
else if (action == "merge") {
  // get the desired output format from the user
  getFormat(function (format) {
    if (Number(format) == 3 && process.argv.length < 6) {
      console.log("You forgot to declare an index (e.g. pid) at EOL, run script again.");
      process.exit();
    }
    var index = process.argv[5];
    var mergedString = "";
    var items = fs.readdirSync(process.argv[3]);
    for (var i = 0; i < items.length; i++) {
      if (items[i].endsWith(".json")) {
        var data = fs.readFileSync(process.argv[3] + "/" + items[i], "utf8");
        var fileLines = data.split("\n");
        for (var a = 0; a < fileLines.length; a++) {
          var item = fileLines[a];
          if (item != "") {
            switch (Number(format)) {
              case 1: // minified JSON
                mergedString += "[" + item + "],\n";
                break;
              case 2: // NDJSON
                mergedString += item + "\n";
                break;
              case 3: // ESJSON (ElasticSearch bulk format)
                mergedString += '{"index":{"_id":"' +
                  JSON.parse(item)[index] +
                  '"}}\n' +
                  item +
                  "\n";
                break;
              default:
                break;
            }
          }
        }
      }
    }
    var writeStream = fs.createWriteStream(process.argv[4]);
    writeStream.write(mergedString);
    writeStream.end();
    writeStream.on("finish", function () {
      process.exit();
    });
  });
}
else {
  console.log("Please provide a correct action");
}

// use recursion to simulate synchronous access to stdin/stdout
function getFormat(callback) {
  process.stdout.write(
    "Select output format: 1: minified JSON, 2: NDJSON, 3: ESJSON: "
  );
  process.stdin.setEncoding("utf8");
  process.stdin.once("data", function (val) {
    // check validity of input (note: chained comparisons like 0 < x < 3
    // don't work in JS, so both bounds are tested explicitly)
    if (!isNaN(val) && Number(val) >= 1 && Number(val) <= 3) {
      callback(val);
    }
    else {
      // if input is invalid, ask again
      getFormat(callback);
    }
  }).resume();
}

What are you using this script for?
slang800 9 days ago
Also, should the destination files be emptied in the beginning of the split command?
slang800 9 days ago
And why don't you just have the whole output filepath in the user specified key?
slang800 9 days ago
When the script matures a bit more, and is capable of splitting to (and merging from) specific directories, and when it can append to (instead of replace) existing files during split, and vice versa for merging, then I will use it to move data back and forth between ElasticSearch <-> folders <-> editing apps.
ericjarvies 9 days ago
At the moment the split command overwrites existing files. After the aforementioned modifications are completed, the next tasks will include adding an APPEND option, so the user can select either OVERWRITE or APPEND. APPEND will read the existing file; if its contents match what splitMe.json contains, it will be ignored, and if they differ, the additional records will be added or the existing records modified. 'The whole output filepath?' I do not understand your question; perhaps you could be more specific?
ericjarvies 9 days ago
I mean rather than having separate filename and directory, you would have one filepath field that contains the directory & filename as one string.
slang800 9 days ago
That's interesting that this is for ElasticSearch - wouldn't it make more sense to just edit the records directly in ES so you don't have to deal with locking the DB or merging your revisions back into the DB with the possibility of those docs having changed since you last exported them?
slang800 9 days ago
It might be more useful if you described what sort of editing you're looking to do. There are much better ways of doing ETL on a dataset than manually editing JSON files.
slang800 9 days ago
@slang800 - One filepath field is fine. I do not manually edit .json (or .geojson, .topojson, etc.) files. Those are strictly for the purpose of version control; they are the intermediate between the editing environment and the publishing environment. Both spatial and attribute data are always edited as collections, grouped according to necessity. In my own particular case, one of my primary editing tools is Refine for attributes, and Cartographica/OpenJump/mapshaper/gdal/etc. for spatial. Two months ago I began delving into ES for the first time, having since built/destroyed/built a few dozen clusters, until I was familiar with the underlying capabilities/shortcomings/requirements/configuration/settings of ElasticSearch. I've not yet ported any data.
ericjarvies 9 days ago
awarded to farolanf


2 Solutions


Solution is up here: https://github.com/slang800/bountify-split-json. I ended up rewriting it to provide a --help command like:

$ node lib/split.js --help
usage: split.js [-h] [-v] [--omit-key] KEY [DIRECTORY]

Read JSON from STDIN, split by key and output to directory

Positional arguments:
  KEY            Key that determines where to output JSON
  DIRECTORY      Location to output JSON files, defaults to ./

Optional arguments:
  -h, --help     Show this help message and exit.
  -v, --version  Show program's version number and exit.
  --omit-key     Remove the KEY field from records before writing.

And changed the internals to use streams, so when you have JSON files that are larger than memory (several GB) it won't crash.
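
For reference, the shape of that streaming approach in plain Node (a simplified sketch, not the repo's actual code; it caches one write stream per output file and ignores backpressure and FD limits):

const fs = require("fs");
const readline = require("readline");

const key = process.argv[2];              // field to split on
const outDir = process.argv[3] || ".";    // output directory
const streams = {};                       // one open write stream per output file

// Read STDIN line by line so memory stays flat regardless of input size.
const rl = readline.createInterface({ input: process.stdin });
rl.on("line", function (line) {
  if (!line.trim()) return;
  const record = JSON.parse(line);
  const name = record[key];
  if (!streams[name]) {
    streams[name] = fs.createWriteStream(outDir + "/" + name + ".json", { flags: "a" });
  }
  streams[name].write(JSON.stringify(record) + "\n");
});
rl.on("close", function () {
  for (const name in streams) streams[name].end();
});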

For your merge script, I think it would be faster to use jq, since you're just cat-ing a bunch of JSON together. For example, with a moderate number of files you could do this to merge and format them all:

cat dir/*.json | jq -c . > output.json

output.json would look like:

{"key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}
{"key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}
{"key2":"path/to/folder/2/","key3":"value3","key4":"value4","key5":"value5"}

If you want to wrap them in an array, you can do:

cat dir/*.json | jq -c --slurp . > output.json

If you have thousands/millions of files, you can do this to avoid listing all the files at once with a glob:

find dir/ -type f -name "*.json" | while read i; do cat "$i"; done | jq -c . > output.json

If you want it in the ElasticSearch bulk API format, then jq can reformat it easily:

$ cat dir/*.json | jq -c '[{"index": {"_id": .key3}}, .] | .[]'
{"index":{"_id":"value3"}}
{"key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}
{"index":{"_id":"value3"}}
{"key2":"path/to/folder/1/","key3":"value3","key4":"value4","key5":"value5"}
{"index":{"_id":"value3"}}
{"key2":"path/to/folder/2/","key3":"value3","key4":"value4","key5":"value5"}

To merge the filename and folder fields in the example file you uploaded, you can use the following:

$ cat sample_records.ndjson | jq -c '.filename = .folder + .filename | del(.folder)' | node lib/split.js filename
What command splits the sample file into key folders using key names? And what command splits the file into just one folder?
ericjarvies 8 days ago
cat sample_records.ndjson | jq -c '.filename = .folder + .filename | del(.folder)' | node lib/split.js filename will give you a hierarchy based on the folder and filename keys. node lib/split.js filename ./your-directory < sample_records.ndjson will just output it into ./your-directory based on the filename field.
slang800 8 days ago
I opted to read everything from STDIN, because that allows you to pipe whatever text streams you want into the process, without writing it to a file first. You can even pipe it directly from a curl request to elasticsearch.
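
For example, something along these lines (the host, index name, and key1 field are placeholders, not from the solution):

curl -s "http://localhost:9200/my-index/_search?size=1000" |
  jq -c '.hits.hits[]._source' |
  node lib/split.js key1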
slang800 8 days ago
@slang800 - Thank you for your input and effort. I did not want to end up with a more complicated script than what I had to begin with, but, between your solution and @farolanf's solution, his was easier from a layman/end-user perspective. Though, in both cases, the addition of node_modules was not desirable; I would have preferred a simple, singular script without dependencies (like it was). That being said, I am going to post another task to add some more options to this latest script. I would like to make it easier from a non-technical end user's perspective, and more efficient from a technical perspective, but with an emphasis on ease-of-use.
ericjarvies 8 days ago

Usage

spmer

  Split a json-line file or merge json files.                    

  A json-line file is a file containing valid json on each line. 

Usage

  node spmer.js -s FILE [options]        
  node spmer.js -m DIR [options]         

  The first form to split FILE.          
  The second form to merge files in DIR. 

Split options

  -s   --split file     a json-line file to split                                        
  -n   --name-key key   key for the file name; records with the same value go in the same file 
  -p   --path-key key   key for the output path                                          
  -t   --omit-name      omit the name key                                                
  -u   --omit-path      omit the path key                                                
  -k   --out-key key    output the groups as array value of this key                     
  -a   --append         append to existing files                                         

Merge options

  -m   --merge dir                dir with json files to merge 
  -o   --merge-output file        merge output file            
  -x   --index ESJSON index key   specify index key for ESJSON 

General options

  -d   --output-dir dir   root output dir, defaults to current dir 
  -h   --help             show this help                           

Split example

node spmer.js -s output.jl -n type -p path -tu

Split the output.jl file, naming files by the type key and output paths by the path key, and omit both keys from the output (-tu is equivalent to -t -u).

Merge example

node spmer.js -m camera -o esjson.json -x model

Merge the .json files in the camera folder and output to esjson.json, using the model key as the ESJSON index.
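
For reference, the resulting esjson.json interleaves one action line and one record per document, along these lines (the model values are illustrative):

{"index":{"_id":"D750"}}
{"model":"D750","key3":"value3","key4":"value4"}
{"index":{"_id":"D810"}}
{"model":"D810","key3":"value3","key4":"value4"}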

Installation

  • Create a folder for the script and enter it
  • Create the package.json file
  • Create the spmer.js file

Then run:

npm install
npm link

Now spmer is installed globally.
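
For example, to perform the Task #3 split from the question (key names taken from the examples above), one could run:

spmer -s splitMe.ndjson -n key1 -p key2 -u -d output_folder

The -u flag omits the path key from the written records, matching the Task #3 requirement; add -t to also omit the name key.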

package.json

{
  "name": "spmer",
  "version": "1.0.0",
  "description": "",
  "main": "spmer.js",
  "dependencies": {
    "command-line-args": "^4.0.1",
    "command-line-usage": "^4.0.0"
  },
  "devDependencies": {},
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Farolan Faisal",
  "license": "ISC",
  "preferGlobal": true,
  "bin": {
    "spmer": "spmer.js"
  }
}

spmer.js

#!/usr/bin/env node
var fs = require("fs");
var path = require("path");
const clu = require('command-line-usage');
const cla = require('command-line-args');

const splitOptions = [
  { name: 'split', alias: 's', type: String, arg: 'file', desc: 'a json-line file to split' },
  { name: 'name-key', alias: 'n', type: String, arg: 'key', desc: 'key for the file name; records with the same value go in the same file' },
  { name: 'path-key', alias: 'p', type: String, arg: 'key', desc: 'key for the output path' },
  { name: 'omit-name', alias: 't', type: Boolean, desc: 'omit the name key' },
  { name: 'omit-path', alias: 'u', type: Boolean, desc: 'omit the path key' },
  { name: 'out-key', alias: 'k', type: String, arg: 'key', desc: 'output the groups as array value of this key' },
  { name: 'append', alias: 'a', type: Boolean, desc: 'append to existing files' },
];

const mergeOptions = [
  { name: 'merge', alias: 'm', type: String, arg: 'dir', desc: 'dir with json files to merge' },
  { name: 'merge-output', alias: 'o', type: String, arg: 'file', desc: 'merge output file' },
  { name: 'index', alias: 'x', type: String, arg: 'ESJSON index key', desc: 'specify index key for ESJSON' },
];

const generalOptions = [
  { name: 'output-dir', alias: 'd', type: String, defaultValue: '.', arg: 'dir', desc: 'root output dir, defaults to current dir' },
  { name: 'help', alias: 'h', type: Boolean, desc: 'show this help' },
];

const optionDefs = splitOptions.concat(mergeOptions).concat(generalOptions);

const help = [
  {
    header: 'spmer',
    content: [
      'Split a json-line file or merge json files.',
      '',
      'A json-line file is a file containing valid json on each line.',
    ],
  },
  {
    header: 'Usage',
    content: [
      'node spmer.js -s FILE [options]',
      'node spmer.js -m DIR [options]',
      '',
      'The first form to split FILE.',
      'The second form to merge files in DIR.',
    ],
  },
  getSectionOption('Split options', splitOptions),
  getSectionOption('Merge options', mergeOptions),
  getSectionOption('General options', generalOptions),
];

function getSectionOption(title, optionDef) {
  return {
    header: title,
    content: optionDef.map(o => ({ 
      a: '-' + o.alias, 
      b: '--' + o.name + ' ' + (o.arg || ''), 
      c: o.desc })),
  };
}

// parse options
const opts = cla(optionDefs);
// console.log(opts);

// show help
if (opts.help) {
  console.log(clu(help));
  process.exit(0);
}

// handle errors
if (!opts.split && !opts.merge) {
  exitErr('Please specify an action: -s (split) or -m (merge).');
} 

function exitErr(str) {
  const errorSection = {
    'header': 'Error',
    'content': str,
  };
  help.push(errorSection);
  console.log(clu(help));
  process.exit(-1);
}


if (opts.split) {
  // track the processed file
  const filenames = [];

  const filepath = opts.split;
  fs.readFile(filepath, function (err, data) {
    if (err) {
      return console.error(err);
    }
    var lines = data.toString().split("\n");
    // determine the input type
    var type = "ndjson";
    // Note: The comma at the end of the line is optional. I assume the format
    // is [{object}],\n[{object}],\n[{object}]\EOF
    if (lines[0].match(/^\[.*\],?$/)) {
      // it's the JSON-style format [<json>],
      type = "json";
    }
    for (var i = 0; i < lines.length; i++) {
      if (lines[i].trim() == "") {
        continue;
      }
      var json;
      if (type == "ndjson") {
        json = JSON.parse(lines[i]);
      }
      else if (type == "json") {
        // strip the surrounding brackets and optional trailing comma
        json = JSON.parse(lines[i].match(/^\[(.*)\],?$/)[1]);
      }

      const nameKey = opts['name-key'];
      const pathKey = opts['path-key'];

      if (!nameKey) {
        exitErr("Please specify the name-key.");
      }

      const filename = json[nameKey];
      const filepath = json[pathKey] || '';

      if (opts['omit-name']) {
        delete json[nameKey];       
      }
      if (opts['omit-path']) {
        delete json[pathKey];
      }

      const outfile = getOutputPath(filepath) + "/" + filename + ".json";

      // truncate if this is the first time writing to this file
      // and not appending 
      let truncate = false;
      if (!opts.append && !filenames.includes(outfile)) {
        truncate = true;
        filenames.push(outfile);
      }

      const outKey = opts['out-key'];
      if (outKey) {
        // add it to the array on out-key
        let obj;
        if (!truncate && fs.existsSync(outfile)) {
          try {
            obj = JSON.parse(fs.readFileSync(outfile));
          }
          catch(x) {
            if (x instanceof SyntaxError) {
              console.log("\nError:\n  A file exists with the same name but not in a valid JSON format.\n\  Perhaps it's the result of previous operation?\n\  Please delete the file or specify another output-dir.\n");                  
            }
            else {
              console.log(x);
            }
            process.exit(-1);
          }
        }
        else {
          obj = { [outKey]: [] };
        }
        obj[outKey].push(json);
        fs.writeFileSync(outfile, JSON.stringify(obj));
      }
      else {
        const data = JSON.stringify(json) + "\n";

        if (truncate) {
          fs.writeFileSync(outfile, data);
        }
        else {
          fs.appendFileSync(outfile, data);
        }
      }
    }
  });
}
else if (opts.merge) {
  const mergeDir = opts.merge;
  var data;
  // get the desired output format from the user
  getFormat(function (format) {
    if (Number(format) == 3 && !opts.index) {
      console.log("You forgot to declare an index (e.g.- pid) at EOL, run script again.");
      process.exit();
    }
    var index = opts.index;
    var mergedString = "";
    var items = fs.readdirSync(mergeDir);
    for (var i = 0; i < items.length; i++) {
      if (items[i].endsWith(".json")) {
        data = fs.readFileSync(mergeDir + '/' + items[i], "utf8");
        for (var a in data.toString().split("\n")) {
          var item = data.toString().split("\n")[a];
          if (item != "") {
            switch (Number(format)) {
              case 1: // minified JSON
                mergedString = mergedString + "[" + item + "],\n";
                break;
              case 2: // NDJSON
                mergedString += item + "\n";
                break;
              case 3: // ESJSON
                mergedString += '{"index":{"_id":"' +
                  JSON.parse(item)[index] +
                  '"}}\n' +
                  item +
                  "\n";
                break;
              default:
                break;
            }
          }
        }
      }
    }
    const filename = opts['merge-output'];
    if (!filename) {
      exitErr('Please specify merge-output file.');
    }

    const filepath = path.join(getOutputPath(), filename); 

    var writeStream = fs.createWriteStream(filepath);
    writeStream.write(mergedString);
    writeStream.end();
    writeStream.on("finish", function () {
      process.exit();
    });
  });
}
else {
  console.log("Please provide a correct action");
}

// use recursion to simulate synchronous access to stdin/stdout
function getFormat(callback) {
  process.stdout.write(
    "Select output format: 1:minified JSON, 2: NDJSON, 3:ESJSON: "
  );
  process.stdin.setEncoding('utf8');
  process.stdin.once('data', function (val) {
    // check validity of input (note: chained comparisons like 0 < x < 3
    // don't work in JS, so both bounds are tested explicitly)
    if (!isNaN(val) && Number(val) >= 1 && Number(val) <= 3) {
      callback(val);
    }
    else {
      // if input is invalid, ask again
      getFormat(callback);
    }
  }).resume();
}

function mkDir(dir) {
  return dir.split('/').reduce((path, folder) => {
    path = path + '/' + fixName(folder);
    if (!fs.existsSync(path)) {
      fs.mkdirSync(path);
    }
    return path;
  }, '');
}

function fixName(name) {
  return name.replace(/\s+/g, '_');  
}

function getOutputPath(dir='') {
  return mkDir(path.resolve(path.join(
    opts['output-dir'], 
    dir)));
}
If this is a large enough dataset to require ElasticSearch, you shouldn't be building up a string in memory with mergedString. You'll quickly run out of RAM and crash your process.
slang800 9 days ago
I thought of rewriting the script but then decided to do just what the owner asked -- add functionality with minimal modification to the original script.
farolanf 9 days ago
farolanf, I was unable to successfully run your script:

module.js:472
    throw err;
    ^
Error: Cannot find module 'command-line-usage'
    at Function.Module._resolveFilename (module.js:470:15)
    at Function.Module._load (module.js:418:25)
    at Module.require (module.js:498:17)
    at require (internal/module.js:20:19)
    at Object.<anonymous> (/Users/propertydb/places/script-farolanf.js:3:13)
    at Module._compile (module.js:571:32)
    at Object.Module._extensions..js (module.js:580:10)
    at Module.load (module.js:488:32)
    at tryModuleLoad (module.js:447:12)
    at Function.Module._load (module.js:439:3)
ericjarvies 9 days ago
Please install the dependencies. npm i -S command-line-args command-line-usage
farolanf 9 days ago
After 'npm install -g command-line-args command-line-usage' I ran the script, and it errors as follows:

path.js:1142
    cwd = process.cwd();
          ^
Error: ENFILE: file table overflow, uv_cwd
    at Object.resolve (path.js:1142:25)
    at getOutputPath (/Users/propertydb/places/script-farolanf.js:244:21)
    at /Users/propertydb/places/script-farolanf.js:146:9
    at FSReqWrap.readFileAfterClose [as oncomplete] (fs.js:416:3)
ericjarvies 9 days ago
@farolanf - Whilst I don't mind installing dependencies in order to make the script work, I can't install them into the working directories (e.g. node_modules folder and sub-folders). Dependencies need to be installed globally (e.g. npm install -g command-line-args command-line-usage). With your above script, it does not work when dependencies are installed globally. When they are installed into the working folder, your script creates the folders and the files, but does not populate them with any data, and the script throws an error.
ericjarvies 9 days ago
It seems it hits the open file limit; try increasing the max open file limit. What is the OS?
farolanf 9 days ago
@farolanf - here is a file containing some sample records to split; https://gofile.me/6tCLb/TeBQvuBZP
ericjarvies 9 days ago
I am running the script on the latest OS X, on a RAM drive that averages ~700MB/s read/write speeds, with 64GB of RAM. Your script creates the folders and files, but it does not write the records to the files (each of the files contains 'UNDEFINED'). How can your script be run without having to install node_modules/ in the working directory?
ericjarvies 9 days ago
I ran the script against the sample successfully. Here's the result. https://postimg.org/gallery/pizz74y4/
farolanf 9 days ago
Ok, I created a limit.maxfiles.plist file and increased my soft and hard limits, and re-ran the script, and this time it populated the files with the records, so that addressed that problem.
ericjarvies 9 days ago
I'll create a package for the script so you can install it globally.
farolanf 9 days ago
Please use the updated spmer.js script, it contains updates for global install. I renamed it to spmer to avoid name clash.
farolanf 9 days ago
@farolanf - regarding Task #1, how do I use the -k flag to output records to files so that they are prefixed with {"userAddressKeyword": [ and suffixed with ]}, i.e. a normal JSON file instead of \n-delimited JSON?
ericjarvies 9 days ago
spmer -s ~/Downloads/sample_records.ndjson -n filename -p folder -tu -k userAddressKeyword
farolanf 9 days ago
@farolanf - When using the -k flag, if the folders/files already exist, the script errors. Without the -k flag, the script appends the existing files, instead of overwriting.
ericjarvies 9 days ago
@farolanf - When trying to output to a single folder (all files to same folder), and not using the -n, -p, -t, or -u flags, but just the -d flag, it fails.
ericjarvies 9 days ago
That's what the original script does -- it appends instead of overwriting. The error when using the -k flag is caused by trying to append to an existing file which doesn't have that key (a file from a previous run without the -k flag). I shall make it overwrite existing files by default, with an option to append.
farolanf 9 days ago
@farolanf - I used the -k flag and created new folders/files. I then ran the -k flag again, and it failed. Regarding getting the script to work: npm link didn't seem to work, so I created the split-merge folder in .npm-packages/lib/node_modules/ and put the package.json and script.js files in it, ran npm install, and then created a symbolic link in .npm-packages/bin named split-merge which points back to .npm-packages/lib/node_modules/split-merge/script.js, and this seems to have done the trick.
ericjarvies 9 days ago
@farolanf - At this point, I see only two problems remaining with the split functionality. 1.) When I run node split-merge -s splitMe.ndjson -d it fails, and when I run node split-merge -s splitMe.ndjson -d fooFolder it fails, so I am unable to output all files into a single folder. 2.) When I run with the -k flag, it does not overwrite (nor append to) the previous files. When I run without the -k flag, it does not overwrite, but it appends. Perhaps a flag to select either append or overwrite.
ericjarvies 9 days ago
The updated version overwrites by default. The script needs the -n key and -p key options set. The -d option needs a folder name: -d folder.
farolanf 9 days ago
@farolanf - the problem with having to set the -p key is that it will create all of the folders/sub-folders. I need an option to split all files into the same root folder (without sub-folders), e.g. split-merge -s splitMe.ndjson -n filename -d fooFolder.
ericjarvies 9 days ago
You shouldn't need to increase the number of file descriptors on your system; you just need to be more careful about the order and number of FDs you open. For example, my script has no problem processing the sample file on a system with 2GB RAM & no RAM disk in about 7 seconds. This is why ETL routines are almost always written to be streaming.
slang800 8 days ago
The async version of appendFile was causing too many open files, as each new append starts before the previous files are closed. I updated it to use the synchronous version.
farolanf 8 days ago
@farolanf - Can you please edit the script so that I can output files to a single folder, meaning when the -p flag is not used, it falls back to the -d flag. And if you could add a flag to select either OVERWRITE or APPEND, that will do it; I'll close/award you the bounty and we'll be done with this iteration.
ericjarvies 8 days ago
Sure. There's already an -a flag for always-append behaviour; if not specified, it overwrites the files. Regarding npm link not working, please check that /usr/bin/env exists and that /usr/bin/node is the Node.js executable.
farolanf 8 days ago
:facepalm: - The synchronous version will block the process until I/O is finished. Your problem isn't async I/O operations, it's flow control. Take a look at how you can do this with streams: https://github.com/slang800/bountify-split-json/blob/master/lib/split.coffee ... It's about 30 lines of actual logic. No building up strings in RAM, no synchronous I/O, and you can start processing even before you have all the data.
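To illustrate the flow-control point, a minimal sketch (the single output file is illustrative; the real implementation in the linked split.coffee keeps one stream per output file):

var fs = require("fs");
var readline = require("readline");

// Respect backpressure instead of going synchronous: when a file's write
// buffer is full, pause the line reader and resume once it drains.
var input = readline.createInterface({ input: process.stdin });
var out = fs.createWriteStream("out.json");

input.on("line", function (line) {
  if (!out.write(line + "\n")) {
    input.pause();                     // stop reading until the file catches up
    out.once("drain", function () {
      input.resume();
    });
  }
});
input.on("close", function () {
  out.end();
});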
slang800 8 days ago
@farolanf - ok (re. -a flag). I installed Brew in my home directory, e.g. PATH=~/homebrew/sbin:~/homebrew/bin:$PATH, and used Brew to install NVM, e.g. NVM_DIR=~/.nvm, and used NVM to install NPM, e.g. ~/.npm-packages >> ~/.npmrc. /usr/bin/env exists, but node is here: ~/homebrew/bin/node
ericjarvies 8 days ago
It should work if node can be accessed globally; otherwise I don't have a solution at the moment. In case you missed it, I have updated the script to output to output-dir (-d) if -p is not present. @slang800 - I'm digesting your code and the pump package, and thinking of adopting the semicolon-less coding style and functional programming.
farolanf 8 days ago