Parsing Microsoft Office Open XML document properties with Node.js

Posted on

Recently, I built an Algolia search index for a large repository of Microsoft Office documents. Algolia provides a hosted search platform that makes it easy to implement search within websites and mobile applications. It’s especially good for searching semi-structured data and returning relevant results quickly.

You can use tools like textract or Apache Tika to extract text from Microsoft Office documents. If you also want to include the standard document properties in your index, you can use the office-document-properties Node.js module to read them from Microsoft Office Open XML documents (docx, docm, pptx, pptm, xlsx, xlsm).

To get started, first install the module using the npm install command:

$ npm install office-document-properties
+ office-document-properties@1.0.4
added 9 packages in 1.382s

To read document properties from a file, import the module and use the fromFilePath method.

// Load the office-document-properties module.
var getDocumentProperties = require('office-document-properties');

/**
 * Node-style callback for fromFilePath: rethrows a reported error,
 * otherwise logs the properties object it was given.
 */
function printFileProperties(error, properties) {
  if (error) {
    throw error;
  }

  console.log(properties);
  /*
  {
    application: 'Microsoft Macintosh Word',
    applicationVersion: '15.0000',
    characters: 20,
    comments: 'This is an example document for testing the retrieval of document properties.',
    company: 'Acme',
    created: '2017-09-06T17:32:00Z',
    createdBy: 'Microsoft Office User',
    keywords: 'Example Sample Test Properties',
    manager: 'John Doe',
    modified: '2017-10-13T19:26:00Z',
    modifiedBy: 'Microsoft Office User',
    pages: 1,
    paragraphs: 1,
    revision: 5,
    subject: 'Example',
    template: 'Normal.dotm',
    title: 'Test Word Document',
    totalTime: 8,
    words: 5
  }
  */
}

// Read the document properties of the file at the given path.
getDocumentProperties.fromFilePath('MyFile.docx', printFileProperties);

Read document properties from a buffer using the fromBuffer method:

// Load the office-document-properties module.
var getDocumentProperties = require('office-document-properties');

// Read the whole file synchronously so there is a buffer to pass in.
var fs = require('fs');
var fileBuff = fs.readFileSync('MyFile.pptx');

// Read document properties from the buffer; the callback rethrows a
// reported error and otherwise logs the properties object.
getDocumentProperties.fromBuffer(fileBuff, function (error, properties) {
  if (error) {
    throw error;
  }

  console.log(properties);
  /*
  {
    application: 'Microsoft Macintosh PowerPoint',
    applicationVersion: '15.0027',
    comments: 'This is an example document for testing the retrieval of document properties.',
    company: 'Acme',
    created: '2017-10-13T15:54:58Z',
    createdBy: 'Microsoft Office User',
    keywords: 'Example Sample Test Properties',
    manager: 'Jane Doe',
    modified: '2017-10-13T19:27:15Z',
    modifiedBy: 'Microsoft Office User',
    notes: 0,
    paragraphs: 2,
    revision: 3,
    slides: 1,
    subject: 'Example',
    title: 'Test PowerPoint Document',
    totalTime: 2,
    words: 8
  }
  */
});

I hope you find this module useful for your data indexing needs – be sure to visit the GitHub project to report any issues you encounter.