Add minified JSON & ESJSON data formats to existing NDJSON-only .js script

This is a command-line script that JOINS multiple NDJSON .json files into a single NDJSON .json file, and it also SPLITS a single NDJSON file into multiple individual NDJSON .json files.

Currently it accepts records that are newline-delimited and look like this:
{"pid":"002-418-762","value_total":1288000}

I need it to also accept normal minified JSON, where each row looks like this:
[{"pid":"002-418-762","value_total":1288000}],

And I need it to JOIN NDJSON or JSON and output it into a single file like this:
{"index":{"_id":"002-418-762"}}
{"pid":"002-418-762","value_total":1288000}

So, in the end it will accept/process either NDJSON or minified JSON, and output either NDJSON, minified JSON, or ESJSON.
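
Since the script will have to detect which of the two input formats it was handed, a per-line check is enough. Here is a minimal sketch (the accepted solution below does the same thing with a regex):

// A row that starts with "[" is the minified-JSON style "[{...}],";
// anything else is treated as a plain NDJSON row.
function detectFormat(firstLine) {
    return firstLine.trim().charAt(0) === "[" ? "json" : "ndjson";
}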

ESJSON = Elasticsearch JSON, which basically means an ES index line plus an NDJSON record line, totaling 2 lines per record.
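
For reference, a minimal Node.js sketch of producing the two ESJSON lines for one record (the record and the pid key come from the samples below; nothing else here is from the script):

// Build the two-line ESJSON representation of one record,
// using its "pid" value as the Elasticsearch document _id.
var record = {"pid": "002-418-762", "value_total": 1288000};
var indexLine = JSON.stringify({index: {_id: record.pid}});
var esjson = indexLine + "\n" + JSON.stringify(record) + "\n";
// Produces:
// {"index":{"_id":"002-418-762"}}
// {"pid":"002-418-762","value_total":1288000}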

Here is the existing script:

// SPLITTING
// example command - node propertydb-v3.js split splitting/split_source.json pid splitting/split_target/
// this takes an individual source file containing multiple records, and outputs each record to its own JSON file in the target folder, using values from the pid column to name the individual JSON files.
// MERGING
// example command - node propertydb-v3.js merge "merging/merge_source" "merging/merge_target.json"
// this takes individual documents within the source folder and combines them into a single NDJSON document

var fs = require("fs");
var action = process.argv[2];

if(action == "split"){
    var fs = require("fs");
    var filePath = process.argv[3];
    var key = process.argv[4];

    var trackingArr = {};
    // Asynchronous read
    fs.readFile(filePath, function (err, data) {
       if (err) {
          return console.error(err);
       }
       var lines = data.toString().split("\n");
       for(var i=0;i<lines.length;i++){
           if(lines[i].trim()==""){
               continue;
           }
           var json = JSON.parse(lines[i]);
           if(!trackingArr[json[key]]){
              trackingArr[json[key]] = ""; 
           }
           trackingArr[json[key]]+=lines[i]+"\n";
       }
       //console.log(Object.keys(trackingArr).length);
       for(key in trackingArr){
            //var writeStream = fs.createWriteStream(process.argv[5]+"/"+key+".json");
            //writeStream.write(trackingArr[key]);
            //writeStream.end();
            trackingArr[key] = trackingArr[key].replace(/\r?\n?[^\r\n]*$/, "");
            fs.writeFile(process.argv[5]+"/"+key+".json", trackingArr[key],  function(err) {

            });
       }
       //console.log(process.argv[5]);
    });
}else if(action == "merge"){
    var mergedString="", fileCount=0;
    fs.readdir(process.argv[3], function(err, items) {
        for (var i=0; i<items.length; i++) {
            if(items[i].endsWith(".json")){
               var data = fs.readFileSync(process.argv[3]+'/'+items[i], "utf8");
               fileCount++;
               mergedString+=data.toString()+"\n";
               if(fileCount==items.length){
                   mergedString=mergedString.replace(/(^[ \t]*\n)/gm,'');
                   mergedString = mergedString.replace(/\r?\n?[^\r\n]*$/, "");
                   var writeStream = fs.createWriteStream(process.argv[4]);
                   writeStream.write(mergedString);
                   writeStream.end();
               }
            }
        }
    });
}else{
    console.log("Please provide a correct action");
}

Here are some sample NDJSON records you can put into a split_source.json file so you can run the SPLIT command to output individual .json files to a folder, after which you can use the MERGE command to join those individual files:

{"pid":"002-418-762","year_tax":2006,"value_land":908000,"value_imprv":380000,"value_total":1288000,"value_levy":7372.44}
{"pid":"002-418-762","year_tax":2007,"value_land":1225000,"value_imprv":380000,"value_total":1605000,"value_levy":7539.21}
{"pid":"002-418-762","year_tax":2008,"value_land":1590000,"value_imprv":375000,"value_total":1965000,"value_levy":7790.25}
{"pid":"002-418-762","year_tax":2009,"value_land":1590000,"value_imprv":375000,"value_total":1965000,"value_levy":8713.44}
{"pid":"002-418-762","year_tax":2010,"value_land":1673000,"value_imprv":439000,"value_total":2112000,"value_levy":9685.32}
{"pid":"002-418-762","year_tax":2011,"value_land":2045000,"value_imprv":477000,"value_total":2522000,"value_levy":10489.62}
{"pid":"002-418-762","year_tax":2012,"value_land":2735000,"value_imprv":509000,"value_total":3244000,"value_levy":11971.04}
{"pid":"002-418-762","year_tax":2013,"value_land":2823000,"value_imprv":493000,"value_total":3316000,"value_levy":12695.23}
{"pid":"002-418-762","year_tax":2014,"value_land":2716000,"value_imprv":477000,"value_total":3193000,"value_levy":13124.14}
{"pid":"002-418-762","year_tax":2015,"value_land":3155000,"value_imprv":480000,"value_total":3635000,"value_levy":14129}
{"pid":"002-429-993","year_tax":2006,"value_land":902000,"value_imprv":69500,"value_total":971500,"value_levy":5440.89}
{"pid":"002-429-993","year_tax":2007,"value_land":1095000,"value_imprv":69500,"value_total":1164500,"value_levy":5590.11}
{"pid":"002-429-993","year_tax":2008,"value_land":1220000,"value_imprv":71000,"value_total":1291000,"value_levy":5655.38}
{"pid":"002-429-993","year_tax":2009,"value_land":1220000,"value_imprv":71000,"value_total":1291000,"value_levy":6087.34}
{"pid":"002-429-993","year_tax":2010,"value_land":1344000,"value_imprv":72300,"value_total":1416300,"value_levy":6517.63}
{"pid":"002-429-993","year_tax":2011,"value_land":1613000,"value_imprv":83900,"value_total":1696900,"value_levy":7145.63}
{"pid":"002-429-993","year_tax":2012,"value_land":2206000,"value_imprv":58200,"value_total":2264200,"value_levy":8266.12}
{"pid":"002-429-993","year_tax":2013,"value_land":2200000,"value_imprv":47000,"value_total":2247000,"value_levy":8900.26}
{"pid":"002-429-993","year_tax":2014,"value_land":2231000,"value_imprv":44500,"value_total":2275500,"value_levy":9446.48}
{"pid":"002-429-993","year_tax":2015,"value_land":2464000,"value_imprv":65700,"value_total":2529700,"value_levy":10138.37}
{"pid":"002-435-373","year_tax":2006,"value_land":829000,"value_imprv":472000,"value_total":1301000,"value_levy":7557.19}
{"pid":"002-435-373","year_tax":2007,"value_land":1120000,"value_imprv":472000,"value_total":1592000,"value_levy":7493.19}
{"pid":"002-435-373","year_tax":2008,"value_land":1398000,"value_imprv":493000,"value_total":1891000,"value_levy":7644.62}
{"pid":"002-435-373","year_tax":2009,"value_land":1398000,"value_imprv":493000,"value_total":1891000,"value_levy":8386.26}
{"pid":"002-435-373","year_tax":2010,"value_land":1455000,"value_imprv":572000,"value_total":2027000,"value_levy":9257.2}
{"pid":"002-435-373","year_tax":2011,"value_land":1886000,"value_imprv":622000,"value_total":2508000,"value_levy":10154.92}
{"pid":"002-435-373","year_tax":2012,"value_land":2573000,"value_imprv":672000,"value_total":3245000,"value_levy":11742.53}
{"pid":"002-435-373","year_tax":2013,"value_land":2599000,"value_imprv":654000,"value_total":3253000,"value_levy":12498.69}
{"pid":"002-435-373","year_tax":2014,"value_land":2383000,"value_imprv":637000,"value_total":3020000,"value_levy":12736.11}
{"pid":"002-435-373","year_tax":2015,"value_land":2729000,"value_imprv":644000,"value_total":3373000,"value_levy":13103.73}
{"pid":"002-439-433","year_tax":2006,"value_land":1261000,"value_imprv":568000,"value_total":1829000,"value_levy":9975.21}
{"pid":"002-439-433","year_tax":2007,"value_land":1613000,"value_imprv":686000,"value_total":2299000,"value_levy":10491.28}
{"pid":"002-439-433","year_tax":2008,"value_land":1934000,"value_imprv":684000,"value_total":2618000,"value_levy":10584.75}
{"pid":"002-439-433","year_tax":2009,"value_land":1934000,"value_imprv":684000,"value_total":2618000,"value_levy":11418.81}
{"pid":"002-439-433","year_tax":2010,"value_land":1988000,"value_imprv":778000,"value_total":2766000,"value_levy":12401.6}
{"pid":"002-439-433","year_tax":2011,"value_land":2600000,"value_imprv":878000,"value_total":3478000,"value_levy":13732.3}
{"pid":"002-439-433","year_tax":2012,"value_land":3591000,"value_imprv":676000,"value_total":4267000,"value_levy":14841.07}
{"pid":"002-439-433","year_tax":2013,"value_land":3543000,"value_imprv":660000,"value_total":4203000,"value_levy":15923.23}
{"pid":"002-439-433","year_tax":2014,"value_land":3504000,"value_imprv":653000,"value_total":4157000,"value_levy":16589.67}
{"pid":"002-439-433","year_tax":2015,"value_land":3901000,"value_imprv":468000,"value_total":4369000,"value_levy":16640.94}
{"pid":"002-443-601","year_tax":2006,"value_land":498000,"value_imprv":114000,"value_total":612000,"value_levy":3795.39}
{"pid":"002-443-601","year_tax":2007,"value_land":698000,"value_imprv":115000,"value_total":813000,"value_levy":3999.27}
{"pid":"002-443-601","year_tax":2008,"value_land":851000,"value_imprv":104000,"value_total":955000,"value_levy":4116.2}
{"pid":"002-443-601","year_tax":2009,"value_land":851000,"value_imprv":104000,"value_total":955000,"value_levy":4628.06}
{"pid":"002-443-601","year_tax":2010,"value_land":875000,"value_imprv":105000,"value_total":980000,"value_levy":4960.08}
{"pid":"002-443-601","year_tax":2011,"value_land":954000,"value_imprv":96100,"value_total":1050100,"value_levy":5110.81}
{"pid":"002-443-601","year_tax":2012,"value_land":1171000,"value_imprv":83500,"value_total":1254500,"value_levy":5448.01}
{"pid":"002-443-601","year_tax":2013,"value_land":1138000,"value_imprv":81400,"value_total":1219400,"value_levy":5545.83}
{"pid":"002-443-601","year_tax":2014,"value_land":1135000,"value_imprv":105000,"value_total":1240000,"value_levy":5754.46}
{"pid":"002-443-601","year_tax":2015,"value_land":1378000,"value_imprv":101000,"value_total":1479000,"value_levy":6423.78}
So, for split: if the input is NDJSON, you want each line in a new file; if it's a single JSON array, you want each array element in a new file. For merge: the input is a directory containing individual JSON files, and you want them combined into either NDJSON or a single minified JSON array. How do I determine which output format to use for merge? Is it up to me, or should I add a command-line flag to determine the format?
Taiiwo 3 years ago
Currently, the script both joins and splits NDJSON as it should, wherein NDJSON rows look like this:
{"pid":"002-443-601","value_total":1479000}
But it needs to also read and write lines that look like this (JSON):
[{"pid":"002-443-601","value_total":1479000}],
The above JSON format includes additional characters, specifically: the opening bracket ( [ ), the closing bracket ( ] ), and the trailing comma ( , ). The script should simply detect which format is being input, and prompt the user for which format to output (NDJSON or JSON?).
ericjarvies 3 years ago
Second, there needs to be one additional output, so the prompt asks the user: output to NDJSON, JSON, or ESJSON? In the case of ESJSON, a record is represented with 2 rows, like so:
{"index":{"_id":"002-443-601"}}
{"pid":"002-443-601","value_total":1479000}
So, the script needs to add an additional leading row for each NDJSON or JSON record/row. Finally, whilst the bounty is only $10, I will tip another $40 to whoever helps me solve this, so thanks in advance.
ericjarvies 3 years ago
These need to succeed:
SPLITTING
1. node script-v5.js split "records_JSON.json" pid "json_files/"
2. node script-v5.js split "records_NDJSON.json" pid "json_files/"
MERGING individual .json files whose rows are in NDJSON format
3. node script-v5.js merge "json_files/" "records_merged_JSON_.json" pid
4. node script-v5.js merge "json_files/" "records_merged_NDJSON_.json" pid
5. node script-v5.js merge "json_files/" "records_merged_ESJSON_.json" pid
MERGING individual .json files whose rows are in JSON format
6. node script-v5.js merge "json_files/" "records_merged_JSON_.json" pid
7. node script-v5.js merge "json_files/" "records_merged_NDJSON_.json" pid
8. node script-v5.js merge "json_files/" "records_merged_ESJSON_.json" pid
ericjarvies 3 years ago
awarded to Taiiwo


1 Solution


Okay, try this out.

Usage: node script.js split [input_file] [key] [output_dir]
Where:
input_file: the large data file to read records from
key: the object key whose value names the split files
output_dir: the directory in which the files will be placed

and
node script.js merge [input_dir] [output_file] [key]
Where:
input_dir: the directory where the files are located
output_file: the name of the file where you want the merged data saved
key: optional; used as the primary key when outputting to ESJSON

Examples:
Split example.ndjson into files inside o/, using pid to determine their filenames:
node script.js split example.ndjson pid o/
Merge all the files in ./o/ into a file called output.ndjson, using key "pid":
node script.js merge o output.ndjson pid
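
Note: merge prompts interactively for the output format (1: minified JSON, 2: NDJSON, 3: ESJSON) before writing, as implemented by the getFormat function at the bottom of the script.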

// SPLITTING
// example command - node propertydb-v3.js split splitting/split_source.json pid splitting/split_target/
// this takes an individual source file containing multiple records, and outputs each record to its own JSON file in the target folder, using values from the pid column to name the individual JSON files.
// MERGING
// example command - node propertydb-v3.js merge "merging/merge_source" "merging/merge_target.json" index
// this takes individual documents within the source folder and combines them into a single NDJSON document

var fs = require("fs");
var action = process.argv[2];

if(action == "split"){
  var filePath = process.argv[3];
  var key = process.argv[4];
  // Asynchronous read
  fs.readFile(filePath, function (err, data) {
    if (err) {
      return console.error(err);
    }
    var lines = data.toString().split("\n");
    // determine the input type
    var type = "ndjson";
    // Note: The comma at the end of the line is optional. I assume the format
    // is [{object}],\n[{object}],\n[{object}]<EOF>
    if (lines[0].match(/\[[^\]]*\],?/)) {
      // it's the JSON-style format `[<json>],`
      type = "json";
    }
    var out = "";
    for (var i = 0; i < lines.length; i++) {
      if (lines[i].trim() == "") {
        continue;
      }
      var json;
      if (type == "ndjson"){
        json = JSON.parse(lines[i]);
      }
      else if (type == "json") {
        json = JSON.parse(lines[i].match(/\[([^\]]*)\],?/)[1]);
      }
      fs.appendFile(
        process.argv[5] + "/" + json[key] + ".json",
        JSON.stringify(json) + "\n",
        function(){}   // suppresses warning
      );
    }
  });
}
else if (action == "merge") {
  var data;
  // get the desired output format from the user
  getFormat(function(format){
    if (Number(format) == 3 && process.argv.length < 6){
      console.log("For ESJSON, you need to supply and index field.");
      process.exit();
    }
    var index = process.argv[5];
    var mergedString = "";
    var items = fs.readdirSync(process.argv[3]);
    for (var i = 0; i < items.length; i++) {
      if (items[i].endsWith(".json")){
        data = fs.readFileSync(process.argv[3] + '/' + items[i], "utf8");
        for (var a in data.toString().split("\n")) {
          var item = data.toString().split("\n")[a];
          if (item != ""){
            switch (Number(format)) {
              case 1:   // minified JSON
                mergedString = mergedString + "[" + item + "],\n";
                break;
              case 2:   // NDJSON
                mergedString += item + "\n";
                break;
              case 3:   // ESJSON
                mergedString += '{"index":"_id":' +
                                JSON.parse(item)[index] +
                                '}\n' +
                                item +
                                "\n";
                break;
              default:
                break;
            }
          }
        }
      }
    }
    // for minified JSON output, the final record must not end with a comma
    if (Number(format) == 1) {
      mergedString = mergedString.replace(/,\n$/, "\n");
    }
    var writeStream = fs.createWriteStream(process.argv[4]);
    writeStream.write(mergedString);
    writeStream.end();
    writeStream.on("finish", function(){
      process.exit();
    });
  });
}
else {
    console.log("Please provide a correct action");
}

// function to use recursion to simulate synchronous access to stdin/stdout
function getFormat(callback){
  process.stdout.write(
    "Select output format: 1:minified JSON, 2: NDJSON, 3:ESJSON: "
  );
  process.stdin.setEncoding('utf8');
  process.stdin.once('data', function(val){
    // check validity of input
    if (!isNaN(val) && Number(val) >= 1 && Number(val) <= 3){
      callback(val);
    }
    else {
      // if input is invalid, ask again
      getFormat(callback);
    }
  }).resume();
}
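
A merge session then looks something like this (paths and file names are illustrative):

$ node script.js merge o output.json pid
Select output format: 1:minified JSON, 2: NDJSON, 3:ESJSON: 3

which, for ESJSON, writes two lines per record to output.json:

{"index":{"_id":"002-418-762"}}
{"pid":"002-418-762","year_tax":2006,"value_land":908000,"value_imprv":380000,"value_total":1288000,"value_levy":7372.44}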
Alright then, I gave your revisions a test drive, and apart from a couple of oversights, all seems well. Great job! Please watch the video and respond if you have any questions, otherwise I'll await those revisions: https://www.dropbox.com/s/4qejg325ds56tka/split-merge-script-v4-by-Taiiwo.mp4?dl=0
ericjarvies 3 years ago
You're totally right! I updated the code in my solution to fix those issues.
Taiiwo 3 years ago
Ok, I saved your latest script into a file named script-v5.js and attempted to split and merge, but both failed, as seen in the following video: https://www.dropbox.com/s/c6syxx2kr25sxr4/split-merge-script-v5-by-Taiiwo.mp4?dl=0
ericjarvies 3 years ago
I changed this back to the previous iteration: json = JSON.parse(lines[i].slice(1,-2)); whilst keeping this change: '}\n' + This allows the script to output a correctly formatted ESJSON file (2 lines per record, as per the ES _bulk API), but the other problem still remains, which is the input of JSON-formatted rows.
ericjarvies 3 years ago
My bad! My dev env broke with a new update and I thought I could make your changes without testing them first. How unprofessional of me. I've now updated the code in the solution and tested every possible combination of splits and merges. Sorry for any inconvenience.
Taiiwo 3 years ago
FYI - I just noticed an omission in the ESJSON output... the first of the two rows is malformed; it currently looks like this: {"index":"_id":002-418-132} but it should look like this: {"index":{"_id":"002-418-132"}}. I made the corrections. Regarding the COMMA at the end of the merged JSON.json file... how can we omit that last comma?
ericjarvies 3 years ago