<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Cascading</title>
	<atom:link href="http://www.cascading.org/feed/" rel="self" type="application/rss+xml" />
	<link>http://www.cascading.org</link>
	<description>Application Platform for Enterprise Big Data</description>
	<lastBuildDate>Thu, 23 May 2013 21:01:57 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.5.1</generator>
		<item>
		<title>Cascading Pattern &#8211; Machine Learning for Cascading and Hadoop</title>
		<link>http://www.cascading.org/2013/05/21/cascading-pattern-machine-learning-for-cascading-and-hadoop/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-pattern-machine-learning-for-cascading-and-hadoop</link>
		<comments>http://www.cascading.org/2013/05/21/cascading-pattern-machine-learning-for-cascading-and-hadoop/#comments</comments>
		<pubDate>Tue, 21 May 2013 16:23:45 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=247</guid>
		<description><![CDATA[<p>Announcing Pattern, a new library and framework that executes PMML workflows as Cascading applications on Apache Hadoop clusters. Read more about it on the Pattern project page, signup for announcements on the mail list, or read the press release.</p>
]]></description>
				<content:encoded><![CDATA[<p>Announcing Pattern, a new library and framework that executes <a href="http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language">PMML</a> workflows as Cascading applications on Apache Hadoop clusters.</p>
<p>Read more about it on the <a href="http://www.cascading.org/pattern">Pattern</a> project page, signup for announcements on the <a href="https://groups.google.com/forum/?fromgroups#!forum/pattern-user">mail list</a>, or read the <a href="http://www.concurrentinc.com/posts/2013/05/21/concurrent-completes-the-big-data-hat-trick-for-hadoop-applications/">press release</a>.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2013/05/21/cascading-pattern-machine-learning-for-cascading-and-hadoop/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Lingual Public Access</title>
		<link>http://www.cascading.org/2013/04/08/lingual-public-access/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=lingual-public-access</link>
		<comments>http://www.cascading.org/2013/04/08/lingual-public-access/#comments</comments>
		<pubDate>Mon, 08 Apr 2013 18:57:54 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=225</guid>
		<description><![CDATA[<p>Lingual is now available for download or build. See the Lingual page for details, or visit the Lingual project page.</p>
]]></description>
				<content:encoded><![CDATA[<p>Lingual is now available for download or build. See the <a href="http://www.cascading.org/lingual/">Lingual page</a> for details, or visit the <a href="https://github.com/Cascading/lingual">Lingual project page</a>.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2013/04/08/lingual-public-access/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Lingual&#8217;s Architecture</title>
		<link>http://www.cascading.org/2013/02/27/linguals-architecture/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=linguals-architecture</link>
		<comments>http://www.cascading.org/2013/02/27/linguals-architecture/#comments</comments>
		<pubDate>Wed, 27 Feb 2013 16:16:29 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=223</guid>
		<description><![CDATA[<p>Julian Hyde discusses how Optiq and Cascading work together to become Lingual.</p>
]]></description>
				<content:encoded><![CDATA[<p>Julian Hyde discusses how <a href="http://julianhyde.blogspot.com/2013/02/announcing-lingual.html">Optiq and Cascading work together</a> to become <a href="http://www.cascading.org/lingual/">Lingual</a>.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2013/02/27/linguals-architecture/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading Lingual &#8211; True SQL for Cascading and Hadoop</title>
		<link>http://www.cascading.org/2013/02/20/cascading-lingual-true-sql-for-cascading-and-hadoop/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-lingual-true-sql-for-cascading-and-hadoop</link>
		<comments>http://www.cascading.org/2013/02/20/cascading-lingual-true-sql-for-cascading-and-hadoop/#comments</comments>
		<pubDate>Wed, 20 Feb 2013 13:55:13 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=219</guid>
		<description><![CDATA[<p>Announcing Lingual, a new framework that executes ANSI SQL queries as Cascading applications on Apache Hadoop clusters. Read more about it on the Lingual project page, signup for announcements on the mail list, or read the press release.</p>
]]></description>
				<content:encoded><![CDATA[<p>Announcing <strong>Lingual</strong>, a new framework that executes ANSI SQL queries as Cascading applications on Apache Hadoop clusters.</p>
<p>Read more about it on the <a href="http://www.cascading.org/lingual">Lingual project page</a>, signup for announcements on the <a href="https://groups.google.com/forum/?fromgroups#!forum/lingual-user">mail list</a>, or read the <a href="http://www.concurrentinc.com/posts/2013/02/20/introducing-lingual-open-source-ansi-sql-for-hadoop/">press release</a>.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2013/02/20/cascading-lingual-true-sql-for-cascading-and-hadoop/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading 2.2 WIP and CoercibleTypes</title>
		<link>http://www.cascading.org/2013/01/02/cascading-2-2-wip-and-coercibletypes/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-2-2-wip-and-coercibletypes</link>
		<comments>http://www.cascading.org/2013/01/02/cascading-2-2-wip-and-coercibletypes/#comments</comments>
		<pubDate>Wed, 02 Jan 2013 21:38:58 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=212</guid>
		<description><![CDATA[<p>Cascading 2.2 is starting to take shape for those interested in test driving emerging features. Of note is &#8220;field type&#8221; support. This allows fields read from an input file to have type information retained through to where the data is sinked/stored to a file. This&#8230; <a class="more-link" href="http://www.cascading.org/2013/01/02/cascading-2-2-wip-and-coercibletypes/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>Cascading 2.2 is starting to take shape for those interested in test driving emerging features.</p>
<p>Of note is &#8220;field type&#8221; support. This allows fields read from an input file to have type information retained through to where the data is sinked/stored to a file.</p>
<p>This is important for a few reasons: </p>
<ul>
<li>Detecting incompatible comparisons during joins and sorting at planner time</li>
<li>Retain canonical types in a Tuple</li>
<li>Reading and writing field type information from/into long term archive files (Avro, Thrift, etc)</li>
<li>Reducing intermediate file size by guaranteeing field type information</li>
<li>Custom type coercion via CoercibleType interface</li>
</ul>
<p>The <a href="http://docs.concurrentinc.com/cascading/2.2/javadoc/cascading/tuple/type/CoercibleType.html">CoercibleType</a> interface is of particular importance. </p>
<p>Consider reading a CSV file with a date column, like <code>28/Dec/2012:16:17:12:931 -0800</code>.</p>
<p>Internally date information is best handled as a <code>long</code> timestamp. But when externalized as a String, it should read as a date string, not a stringified long value.</p>
<p>The <a href="http://docs.concurrentinc.com/cascading/2.2/javadoc/cascading/tuple/type/DateType.html">DateType</a> implementation of CoercibleType can be used when declaring the date field. Given the correct date format string, the value of the date field will be stored as its canonical type, <code>long</code>. </p>
<p>So if an Operation or sink Scheme wants the value as a string, by calling <code>tupleEntry.getString("date")</code>, it will be automatically converted back to the proper date string.</p>
<p>Or to store a long value of the date string, the code can call <code>tupleEntry.setString("date", "28/Dec/2012:16:17:12:931 -0800")</code>, resulting in<br />
<code>tupleEntry.getObject("date") instanceof Long</code> is <code>true</code>. </p>
<p>CoercibleType isn&#8217;t a replacement for data-cleansing code that can handle contingencies in the data, but for data that is known to be clean, even data emitted from prior Cascading Flows, it is quite handy.</p>
<p>This opens up the door for more complex types that may have multiple representations. Consider a hypothetical <code>Person</code> object that can be serialized as binary to disk, but has a JSON String representation, or has a Map Object in memory/runtime representation. </p>
<p>See <a href="http://conjars.org/cascading/cascading-core">conjars</a> or the <a href="http://www.concurrentinc.com/downloads/">Concurrent site</a> for downloads.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2013/01/02/cascading-2-2-wip-and-coercibletypes/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading 2.1</title>
		<link>http://www.cascading.org/2012/10/30/cascading-2-1/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-2-1</link>
		<comments>http://www.cascading.org/2012/10/30/cascading-2-1/#comments</comments>
		<pubDate>Tue, 30 Oct 2012 17:04:27 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=211</guid>
		<description><![CDATA[<p>We are happy to announce that Cascading 2.1 is now publicly available for download. http://www.cascading.org/downloads/ This release includes a number of new features. Specifically: &#8211; Restartable Flows using Checkpointing &#8211; Improved memory utilization and gc &#8211; Refactored build system, source and javadoc jars now available&#8230; <a class="more-link" href="http://www.cascading.org/2012/10/30/cascading-2-1/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>We are happy to announce that Cascading 2.1 is now publicly available for download.</p>
<p><a href="http://www.cascading.org/downloads/">http://www.cascading.org/downloads/</a></p>
<p>This release includes a number of new features. Specifically:</p>
<p>- Restartable Flows using Checkpointing<br />
- Improved memory utilization and gc<br />
- Refactored build system, source and javadoc jars now available through <a href="http://conjars.org/">conjars.org</a>
</p>
<p>For more details see:</p>
<p><a href="https://github.com/Cascading/cascading/blob/2.1/CHANGES.txt">https://github.com/Cascading/cascading/blob/2.1/CHANGES.txt</a></p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2012/10/30/cascading-2-1/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading for the Impatient, Part 6</title>
		<link>http://www.cascading.org/2012/08/07/cascading-for-the-impatient-part-6/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-for-the-impatient-part-6</link>
		<comments>http://www.cascading.org/2012/08/07/cascading-for-the-impatient-part-6/#comments</comments>
		<pubDate>Tue, 07 Aug 2012 20:22:04 +0000</pubDate>
		<dc:creator>Paco Nathan</dc:creator>
				<category><![CDATA[Impatient]]></category>
		<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=192</guid>
		<description><![CDATA[<p>In our fifth installment of this series we showed how to implement TF-IDF in a Cascading application. If you haven’t read that yet, it’s probably best to start there. Today&#8217;s post extends the TF-IDF app to show best practices for test-driven development (TDD) at scale. We’ll&#8230; <a class="more-link" href="http://www.cascading.org/2012/08/07/cascading-for-the-impatient-part-6/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>In our <a href="http://www.cascading.org/2012/07/31/cascading-for-the-impatient-part-5/">fifth installment of this series</a> we showed how to implement <a href="http://en.wikipedia.org/wiki/Tf*idf">TF-IDF</a> in a <a href="http://www.cascading.org/">Cascading</a> application. If you haven’t read that yet, it’s probably best to start there.</p>
<p>Today&#8217;s post extends the TF-IDF app to show best practices for <a href="http://en.wikipedia.org/wiki/Test-driven_development">test-driven development</a> (TDD) at scale. We’ll incorporate unit tests into the build (should have done so sooner), plus show how to leverage TDD features which are unique to Cascading: checkpoints, traps, assertions, etc. These features are based on using <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/Checkpoint.html">Checkpoint</a>, <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/Debug.html">Debug</a>, and <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/assertion/AssertMatches.html">AssertMatches</a>.</p>
<p>We’ll keep building on this example to show how to leverage <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/flow/local/package-summary.html">“local mode”</a>.</p>
<h2>Theory</h2>
<p>At first glance, the notion of <a href="http://en.wikipedia.org/wiki/Test-driven_development">test-driven development</a> (TDD) might seem a bit antithetical in the context of Big Data. After all, TDD is all about short development cycles, writing automated test cases which are intended to fail, and lots of refactoring. Those descriptions would not appear to fit with batch jobs involving terabytes of data and huge clusters running apps that take days to complete.</p>
<p>Stated in a different way, according to Kent Beck, TDD “encourages simple designs and inspires confidence.” That statement does actually fit well with Cascading. The API is intended to provide simple design patterns for working with data – <em>GroupBy</em>, <em>Join</em>, <em>Count</em>, <em>Regex</em>, <em>Filter</em> – so that the need for writing custom functions becomes relatively rare. That speaks to “encouraging simple designs” directly. The practice in Cascading of modeling business process and orchestrating MapReduce workflows – that speaks to “inspiring confidence” in a big way.</p>
<p>So now we’ll let the cat out of the bag for a little secret&#8230; Working with unstructured data at scale has been shown to be quite valuable (Google, Amazon, LinkedIn, Twitter, etc.) however most of the “heavy lifting” which we perform in MapReduce workflows is essentially cleaning up data. DJ Patil explained this point quite eloquently in <a href="http://radar.oreilly.com/2012/07/data-jujitsu.html">Data Jujitsu</a>: <em>“It’s impossible to overstress this: 80% of the work in any data project is in cleaning the data &#8230; Work done up front in getting clean data will be amply repaid over the course of the project.”</em> </p>
<p>Cleaning up the data allows for subsequent use of <a href="http://en.wikipedia.org/wiki/Sampling_(statistics)">sampling techniques</a>, <a href="http://en.wikipedia.org/wiki/Dimension_reduction">dimensional reduction</a>, and other practices which help alleviate some of the bottlenecks which might otherwise be encountered in Big Data. In other words, there are great use cases for formalisms which help demonstrate that “dirty” data at scale has been cleaned up. Those turn out to be quite valuable in practice. </p>
<p>However, TDD practices tend to be based on <a href="http://en.wikipedia.org/wiki/Unit_test">unit tests</a> or <a href="http://en.wikipedia.org/wiki/Mock_object">mocks</a> &#8230; how does one write a quick unit test for a Godzilla-sized dataset?</p>
<p>The short answer is: you don’t. However, you can greatly reduce the <em>need</em> for writing unit test coverage by limiting the amount of custom code required. Hopefully we’ve shown that aspect of Cascading by now. Beyond that aspect, you can use sampling techniques to quantify the confidence for an app running correctly. You can also define system tests at scale in relatively simple ways. Furthermore, you can define contingencies for <em>what to do when assumptions fail</em> &#8230; as they inevitably do, at scale.</p>
<p>Let’s discuss sampling… generally speaking, large MapReduce workflows are relatively opaque processes which are difficult to observe. However, Cascading provides two techniques for observing portions of a workflow. One very simple approach is to insert a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/Debug.html">Debug</a> into a pipe, to see the tuple values passing through a particular part of a workflow. Debug output goes to the log instead of a file, but it can be turned off, e.g., with a command line option. If the data is large, one can use a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/filter/Sample.html">Sample</a> filter to sample the tuple values which get written to the log.</p>
<p>Another approach is to use a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/Checkpoint.html">Checkpoint</a>, which forces intermediate data to be written out to HDFS. This may also become important for performance reasons, i.e., forcing results to disk to avoid recomputing – e.g., when there are multiple uses for the output of a pipe downstream such as with the right side of a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a>. Sampling may be performed either before (like with Debug) or after the data gets persisted to HDFS.</p>
<p>Next, let’s talk about system tests. Cascading includes support for <a href="http://docs.cascading.org/cascading/2.0/userguide/htmlsingle/#N212F4">stream assertions</a>. These provide mechanisms for asserting that the values in a tuple stream meet certain criteria – similar to the <code>assert</code> keyword in Java, or an <code>assert not null</code> in a unit test. We can assert patterns <em>strictly</em> as unit tests during development, then run testing against regression data. For performance reasons, we might use command line options to turn off assertions in production. Or keep them, if a use case requires that level of guarantees.</p>
<p>Lastly, what to do when assumptions fail? One lesson of working with data at scale is that the best assumptions will inevitably fail. Unexpected things happen, and 80% of the work will be cleaning up problems. Cascading defines <a href="http://docs.cascading.org/cascading/2.0/userguide/htmlsingle/#N21360">failure traps</a> which capture data that causes an Operation to fail, e.g., throw an Exception. For example, perhaps 99% of the cases in your log files can be rolled up into a set of standard reports&#8230; but 1% requires manual review. Great, process the 99% which work and shunt the 1% failure cases into a special file, marked “for manual review”. Keep in mind, however, that traps are intended for handling exceptional cases. If you know in advance how to categorize good vs. bad data, then use a <a href="http://docs.cascading.org/cascading/2.0/userguide/html/ch10s09.html">filter instead of a trap</a>.</p>
<p>Meanwhile, a conceptual diagram for this implementation of TF-IDF in Cascading is shown as:</p>
<p><a href="http://www.cascading.org/files/2012/08/plumb6.png"><img src="http://www.cascading.org/files/2012/08/plumb6-150x150.png" alt="" title="Conceptual Diagram - &quot;Impatient&quot;, Part 6" width="150" height="150" class="aligncenter size-thumbnail wp-image-206" /></a></p>
<h2>Source</h2>
<p>Download source for this example on <a href="https://github.com/Cascading/Impatient/tree/master/part6">GitHub</a>. You’ll need to clone the whole of this multi-part series:</p>
<pre><code>git clone git://github.com/Cascading/Impatient.git</code></pre>
<p>For quick reference, the source code and a log for this example are listed in a <a href="https://gist.github.com/3044049">gist</a>. The input data stays the same as in the <a href="https://gist.github.com/2911686">earlier code</a>.</p>
<p>Let’s add a unit test and show how that works into this example. In the <a href="http://www.gradle.org/">Gradle</a> build script <code>build.gradle</code> we need to modify the <code>compile</code> task to include <a href="http://www.junit.org/">JUnit</a> and other testing dependencies:</p>
<p><code></p>
<pre>compile( 'cascading:cascading-hadoop:2.0.1' ) { transitive = true }
testCompile( 'org.apache.hadoop:hadoop-test:1.0.3' )
testCompile( 'junit:junit:4.8.+' )</pre>
<p></code></p>
<p>Then we’ll add a <code>test</code> task:</p>
<p><code></p>
<pre>test {
  include 'impatient/**'
  //makes the standard streams (err and out) visible at console when running tests
  testLogging.showStandardStreams = true
  //listening to test execution events
  beforeTest { descriptor -&gt;
     logger.lifecycle(&quot;Running test: &quot; + descriptor)
  }
  onOutput { descriptor, event -&gt;
     logger.lifecycle(&quot;Test: &quot; + descriptor + &quot; produced standard out/err: &quot; + event.message )
  }
}</pre>
<p></code></p>
<p>A little restructuring of the source directories is required – see our <a href="https://github.com/Cascading/Impatient/tree/master/part6">GitHub code repo</a>, where it’s all set up properly. Then we add a unit test for our custom function to “scrub” tokens, which was created in <em>Part 3</em>. This goes into a new class <code>ScrubTest.java</code>:</p>
<p><code></p>
<pre>public class ScrubTest
  {
  @Test
  public void testMain() throws Exception
    {
    ScrubTest tester = new ScrubTest();
    Fields fieldDeclaration = new Fields( &quot;doc_id&quot;, &quot;token&quot; );
    ScrubFunction scrub = new ScrubFunction( fieldDeclaration );
    assertEquals( &quot;Scrub&quot;, &quot;foo bar&quot;, scrub.scrubText( &quot;FoO BAR  &quot; ) );
    }
  }</pre>
<p></code></p>
<p>This is a particularly good place for a unit test. Scrubbing tokens is a likely point at which edge cases get encountered at scale. In practice, you’d probably want to define even more unit tests.</p>
<p>Next, going back to the <code>Main.java</code> module, let’s add sink taps for writing out trapped data and checkpointed data:</p>
<p><code></p>
<pre>String trapPath = args[ 4 ];
String checkPath = args[ 5 ];
Tap trapTap = new Hfs( new TextDelimited( true, &quot;\t&quot; ), trapPath );
Tap checkTap = new Hfs( new TextDelimited( true, &quot;\t&quot; ), checkPath );</pre>
<p></code></p>
<p>Next we’ll modify the head of the existing pipe assembly for TF-IDF to incorporate a <a href="http://docs.cascading.org/cascading/2.0/userguide/htmlsingle/#N212F4">Stream Assertion</a>. We use an <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/assertion/AssertMatches.html">AssertMatches</a> to define the expected pattern for input data. Then we apply <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/AssertionLevel.html#STRICT">AssertionLevel.STRICT</a> to force validation of the data:</p>
<p><code></p>
<pre>// use a stream assertion to validate the input data
Pipe docPipe = new Pipe( &quot;token&quot; );
AssertMatches assertMatches = new AssertMatches( &quot;doc\\d+\\s.*&quot; );
docPipe = new Each( docPipe, AssertionLevel.STRICT, assertMatches );</pre>
<p></code></p>
<p>Next we’ll add a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/Debug.html">Debug</a> and <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/DebugLevel.html#VERBOSE">DebugLevel.VERBOSE</a> to the <em>D</em> branch, to trace the tuple values in the flow there:</p>
<p><code></p>
<pre>// example use of a debug, to observe tuple stream; turn off below
dfPipe = new Each( dfPipe, DebugLevel.VERBOSE, new Debug( true ) );</pre>
<p></code></p>
<p>Next we’ll add a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/Checkpoint.html">Checkpoint</a> after the join of the <em>DF</em> and <em>D</em> branches. That forces the tuples at this point in the workflow to be persisted to HDFS:</p>
<p><code></p>
<pre>// create a checkpoint, to observe the intermediate data in DF stream
Checkpoint idfCheck = new Checkpoint( &quot;checkpoint&quot;, idfPipe );
Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfCheck, df_token );</pre>
<p></code></p>
<p>Next we have a relatively more complex set of taps to connect in the <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/flow/FlowDef.html">FlowDef</a>, to include output data for TDD-related features:</p>
<p><code></p>
<pre>// connect the taps, pipes, traps, checkpoints, etc., into a flow                                                                                         
FlowDef flowDef = FlowDef.flowDef()
 .setName( &quot;tfidf&quot; )
 .addSource( docPipe, docTap )
 .addSource( stopPipe, stopTap )
 .addTailSink( tfidfPipe, tfidfTap )
 .addTailSink( wcPipe, wcTap )
 .addTrap( docPipe, trapTap )
 .addCheckpoint( idfCheck, checkTap );</pre>
<p></code></p>
<p>Last, we’ll specify the <em>verbosity</em> level for the debug trace, and the <em>strictness</em> level for the stream assertion:</p>
<p><code></p>
<pre>// set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
flowDef.setDebugLevel( DebugLevel.VERBOSE );
// set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
flowDef.setAssertionLevel( AssertionLevel.STRICT );</pre>
<p></code></p>
<p>Modify the <code>Main</code> method to make those changes, then build a JAR file. You should be good to go. For those keeping score, the resulting physical plan in MapReduce for <em>Part 6</em> now uses twelve mappers and nine reducers. In other words, we added one mapper as the overhead for gaining lots of test features.</p>
<p>The diagram for the Cascading flow will be in the <code>dot/</code> subdirectory after the app runs. Here we have annotated it to show where the <em>mapper</em> and <em>reducer</em> phases are running, and also the sections which were added since <em>Part 5</em>:</p>
<p><a href="http://www.cascading.org/files/2012/08/tfidf.png"><img src="http://www.cascading.org/files/2012/08/tfidf-209x300.png" alt="" title="TF-IDF flow diagram – “Impatient”, Part 6" width="209" height="300" class="aligncenter size-medium wp-image-200" /></a></p>
<p>If you want to read in more detail about the classes in the Cascading API which were used, see the Cascading 2.0 <a href="http://docs.cascading.org/cascading/2.0/userguide/html/">User Guide</a> and <a href="http://docs.cascading.org/cascading/2.0/javadoc/">JavaDoc</a>.</p>
<h2>Build</h2>
<p>The build for this example is based on using <a href="http://gradle.org/">Gradle</a>. The script is in <code>build.gradle</code> and to generate an <a href="https://www.jetbrains.com/idea/">IntelliJ project</a> use:</p>
<p><code></p>
<pre>gradle ideaModule</pre>
<p></code></p>
<p>To build the sample app from the command line use:</p>
<p><code></p>
<pre>gradle clean jar</pre>
<p></code></p>
<p>What you should have at this point is a JAR file which is nearly ready to drop into your <a href="https://maven.apache.org/">Maven</a> repo — almost. Actually, we provide a community jar repository for Cascading libraries and extensions at <a href="http://conjars.org">http://conjars.org</a></p>
<h2>Run</h2>
<p>Before running this sample app, you’ll need to have a supported release of <a href="http://hadoop.apache.org/">Apache Hadoop</a> installed. Here’s what was used to develop and test our example code:</p>
<p><code></p>
<pre>$ hadoop version
Hadoop 1.0.3</pre>
<p></code></p>
<p>Be sure to set your <code>HADOOP_HOME</code> environment variable. Then clear the <code>output</code> directory (Apache Hadoop insists, if you&#8217;re running in standalone mode) and run the app:</p>
<p><code></p>
<pre>rm -rf output
hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop output/tfidf output/trap output/check</pre>
<p></code></p>
<p>The output log should include a warning, based on the stream assertion, which looks like this:</p>
<p><code></p>
<pre>12/08/06 14:15:07 WARN stream.TrapHandler: exception trap on branch: 'token', for fields: [{2}:'doc_id', 'text'] tuple: ['zoink', 'null']
cascading.operation.AssertionException: argument tuple: ['zoink', 'null'] did not match: doc\d+\s.*
    at cascading.operation.assertion.BaseAssertion.throwFail(BaseAssertion.java:107)
    at cascading.operation.assertion.AssertMatches.doAssert(AssertMatches.java:84)
    at cascading.flow.stream.ValueAssertionEachStage.receive(ValueAssertionEachStage.java:57)
    at cascading.flow.stream.ValueAssertionEachStage.receive(ValueAssertionEachStage.java:33)
    at cascading.flow.stream.SourceStage.map(SourceStage.java:102)
    at cascading.flow.stream.SourceStage.run(SourceStage.java:58)
    at cascading.flow.hadoop.FlowMapper.run(FlowMapper.java:124)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)</pre>
<p></code></p>
<p>That is expected behavior. We directed the API to show a warning when stream assertions fail. The data which caused this warning will get trapped.</p>
<p>Not too far after that point in the log, there should be debug output which looks like the following:</p>
<p><code></p>
<pre>12/08/06 14:15:46 INFO hadoop.FlowReducer: sinking to: TempHfs[&quot;SequenceFile[ ['df_count', 'df_token', 'lhs_join']]&quot;][DF/93669/]
['df_count', 'df_token', 'lhs_join']
['1', 'air', '1']
['3', 'area', '1']
['1', 'australia', '1']
['1', 'broken', '1']</pre>
<p></code></p>
<p>&#8230; plus several more lines. That is the result of our debug trace.</p>
<p>Output text gets stored in the partition file <code>output/tfidf</code> which you can then verify:</p>
<p><code></p>
<pre>more output/tfidf/part-00000
more output/trap/part-m-00001-00000 
more output/check/part-00000</pre>
<p></code></p>
<p>Notice the data tuple trapped in <code>output/trap</code>:</p>
<p><code></p>
<pre>zoink   null</pre>
<p></code></p>
<p>That did not match the regex <code>doc\\d+\\s.*</code> which was specified by the stream assertion.</p>
<p>Here’s a <a href="https://gist.github.com/3044049">log file</a> from our run of the sample app, part 6. If your run looks terribly different, something is probably not set up correctly.</p>
<p>To run this same app on the Amazon AWS <a href="http://aws.amazon.com/elasticmapreduce/">Elastic MapReduce</a> service, based on their <a href="http://aws.amazon.com/developertools/2264">command line interface</a>, use the following commands. Be sure to replace <code>temp.cascading.org</code> with your own <a href="http://aws.amazon.com/s3/">S3</a> bucket name:</p>
<pre><code>s3cmd put build/libs/impatient.jar s3://temp.cascading.org/impatient/part6.jar
s3cmd put data/rain.txt s3://temp.cascading.org/impatient/
s3cmd put data/en.stop s3://temp.cascading.org/impatient/

elastic-mapreduce --create --name &quot;TF-IDF&quot; \
  --jar s3n://temp.cascading.org/impatient/part6.jar \
  --arg s3n://temp.cascading.org/impatient/rain.txt \
  --arg s3n://temp.cascading.org/impatient/out/wc \
  --arg s3n://temp.cascading.org/impatient/en.stop \
  --arg s3n://temp.cascading.org/impatient/out/tfidf \
  --arg s3n://temp.cascading.org/impatient/out/trap \
  --arg s3n://temp.cascading.org/impatient/out/check</code></pre>
<p>Drop us a line on the <a href="https://groups.google.com/forum/?fromgroups#!forum/cascading-user">cascading-user</a> email forum. Or visit one of our user group meetings. [Coming up real soon&#8230;]</p>
<p>Also, compare these other excellent implementations of the example apps here – by <a href="http://sujitpal.blogspot.com/2012/08/scalding-for-impatient.html">Sujit Pal</a> in <a href="https://github.com/twitter/scalding/wiki">Scalding</a> and by <a href="https://github.com/Quantisan/Impatient">Paul Lam</a> in <a href="https://github.com/nathanmarz/cascalog/wiki">Cascalog</a>.</p>
<p>Stay tuned for the next installments of our <a href="http://www.cascading.org/category/impatient/" target="_blank">Cascading for the Impatient</a> series.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2012/08/07/cascading-for-the-impatient-part-6/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading for the Impatient, Part 5</title>
		<link>http://www.cascading.org/2012/07/31/cascading-for-the-impatient-part-5/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-for-the-impatient-part-5</link>
		<comments>http://www.cascading.org/2012/07/31/cascading-for-the-impatient-part-5/#comments</comments>
		<pubDate>Tue, 31 Jul 2012 23:17:25 +0000</pubDate>
		<dc:creator>Paco Nathan</dc:creator>
				<category><![CDATA[Impatient]]></category>
		<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=185</guid>
		<description><![CDATA[<p>In our fourth installment of this series we showed how to use HashJoin on two pipes, to perform “stop words” filtering at scale in a Cascading 2.0 application. If you haven’t read that yet, it’s probably best to start there. Today&#8217;s lesson builds on that&#8230; <a class="more-link" href="http://www.cascading.org/2012/07/31/cascading-for-the-impatient-part-5/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>In our <a href="http://www.cascading.org/2012/07/24/cascading-for-the-impatient-part-4/" title="Cascading for the Impatient, Part 4" target="_blank">fourth installment of this series</a> we showed how to use <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a> on two pipes, to perform “stop words” filtering at scale in a <a href="http://www.cascading.org/" target="_blank">Cascading 2.0</a> application. If you haven’t read that yet, it’s probably best to start there.</p>
<p>Today&#8217;s lesson builds on that same Word Count app and now implements <a href="http://en.wikipedia.org/wiki/Tf*idf" target="_blank">TF-IDF</a> in Cascading. We’ll show how to use a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/assembly/SumBy.html" target="_blank">SumBy</a> and a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/CoGroup.html" target="_blank">CoGroup</a> to aggregate the data needed, and then how to use an <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/expression/ExpressionFunction.html" target="_blank">ExpressionFunction</a> to calculate the TF-IDF weights. We also continue to show best practices for workflow orchestration and <a href="http://en.wikipedia.org/wiki/Test-driven_development" target="_blank">test-driven development</a> (TDD) at scale.</p>
<h2>Theory</h2>
<p>Fortunately, almost all of the data required to calculate <a target="_blank" href="http://en.wikipedia.org/wiki/Tf*idf">TF-IDF</a> weights was already available in our Word Count example in <em>Part 4</em>. However, we’ll need to revise the overall workflow, adding more pipe assemblies to it.</p>
<p>TF-IDF calculates a metric for each token which indicates how “important” that token is to a document <em>within the context of a collection of documents</em>. The metric is calculated based on relative frequencies. On one hand, tokens which appear in most documents tend to have very low TF-IDF weights. On the other hand, tokens which are less common but appear multiple times in a few documents tend to have very high TF-IDF weights. Consequently, the TF-IDF algorithm gets used to drive the indexing in some text search engines, such as <a target="_blank" href="http://lucene.apache.org/core/">Apache Lucene</a>. In particular, TF-IDF provides an effective way to rank documents for a search query. For a good discussion of this in gory detail, see the <a target="_blank" href="http://lucene.apache.org/core/old_versioned_docs/versions/2_9_0/api/all/org/apache/lucene/search/Similarity.html">Similarity class</a> in Lucene.</p>
<p>Note that in the literature, <code>token</code> and <code>term</code> may be used interchangeably for this sample app. More advanced text analytics might look at sequences of words, in which case a <code>term</code> becomes a more complex structure. However, we’re only looking at single words.</p>
<p>We’ll need the following components to calculate TF-IDF:</p>
<ul>
<li><em>term count</em>: number of times a given term appears in a given document</li>
<li><em>document frequency</em>: how frequently a given term appears across all documents</li>
<li><em>number of terms</em>: total number of terms in a given document</li>
<li><em>document count</em>: total number of documents</li>
</ul>
<p>Slight modifications to Word Count provide the means to get both <em>term count</em> and <em>document frequency</em>, along with the other two components which get calculated almost as by-products. In this sense, we get to leverage Cascading by re-using the results of some pipes within our workflow. A conceptual diagram for this implementation of TF-IDF in Cascading is shown as:</p>
<p><a href="http://www.cascading.org/files/2012/08/plumb5.png"><img src="http://www.cascading.org/files/2012/08/plumb5-150x150.png" alt="" title="Conceptual Diagram - &quot;Impatient&quot;, Part 5" width="150" height="150" class="aligncenter size-thumbnail wp-image-205" /></a></p>
<h2>Source</h2>
<p>Download source for this example on <a target="_blank" href="https://github.com/Cascading/Impatient/tree/master/part5">GitHub</a>. You’ll need to clone the whole of this multi-part series:</p>
<pre><code>git clone git://github.com/Cascading/Impatient.git</code></pre>
<p>For quick reference, the source code and a log for this example are listed in a <a target="_blank" href="https://gist.github.com/3043791">gist</a>. The input data stays the same as in the <a target="_blank" href="https://gist.github.com/2911686">earlier code</a>.</p>
<p>First, let’s add another sink tap to write the TF-IDF weights as an output data set:</p>
<pre><code>String tfidfPath = args[ 3 ];
Tap tfidfTap = new Hfs( new TextDelimited( true, &quot;\t&quot; ), tfidfPath );</code></pre>
<p>Next we’ll modify the existing pipe assemblies for Word Count, beginning immediately after the “stop words” filter. We add the following line to retain only the <code>doc_id</code> and <code>token</code> fields:</p>
<pre><code>tokenPipe = new Retain( tokenPipe, fieldSelector );</code></pre>
<p>Then we re-use the intermediate results from <code>tokenPipe</code>, creating three different branches in the workflow. The first addresses <em>term counts</em>:</p>
<p><code></p>
<pre>// one branch of the flow tallies the token counts for term frequency (TF)
Pipe tfPipe = new Pipe( &quot;TF&quot;, tokenPipe );
tfPipe = new GroupBy( tfPipe, new Fields( &quot;doc_id&quot;, &quot;token&quot; ) );
Fields tf_count = new Fields( &quot;tf_count&quot; );
tfPipe = new Every( tfPipe, Fields.ALL, new Count( tf_count ), Fields.ALL );
Fields tf_token = new Fields( &quot;tf_token&quot; );
tfPipe = new Rename( tfPipe, token, tf_token );</pre>
<p></code></p>
<p>At that point, we have <em>TF</em> values for each token.</p>
<p>In a second branch we’ll calculate <em>D</em>, the total number of documents in a way which can be consumed later in a join. This uses a built-in partial aggregate operation called <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/assembly/SumBy.html" target="_blank">SumBy</a>:</p>
<p><code></p>
<pre>// one branch counts the number of documents (D)
Fields doc_id = new Fields( &quot;doc_id&quot; );
Fields tally = new Fields( &quot;tally&quot; );
Fields rhs_join = new Fields( &quot;rhs_join&quot; );
Fields n_docs = new Fields( &quot;n_docs&quot; );
Pipe dPipe = new Unique( &quot;D&quot;, tokenPipe, doc_id );
dPipe = new Each( dPipe, new Insert( tally, 1 ), Fields.ALL );
dPipe = new Each( dPipe, new Insert( rhs_join, 1 ), Fields.ALL );
dPipe = new SumBy( dPipe, rhs_join, tally, n_docs, long.class );</pre>
<p></code></p>
<p>This part may seem less than intuitive&#8230; and it is a bit odd. We need a total document count as a field, in each tuple for the RHS of the join. That keeps our processing parallel, allowing this calculation to scale-out horizontally.</p>
<p>The third branch calculates <em>DF</em> as a step toward <em>inverse document frequency</em> per token:</p>
<p><code></p>
<pre>// one branch tallies the token counts for document frequency (DF)
Pipe dfPipe = new Unique( &quot;DF&quot;, tokenPipe, Fields.ALL );
dfPipe = new GroupBy( dfPipe, token );
Fields df_count = new Fields( &quot;df_count&quot; );
Fields df_token = new Fields( &quot;df_token&quot; );
Fields lhs_join = new Fields( &quot;lhs_join&quot; );
dfPipe = new Every( dfPipe, Fields.ALL, new Count( df_count ), Fields.ALL );
dfPipe = new Rename( dfPipe, token, df_token );
dfPipe = new Each( dfPipe, new Insert( lhs_join, 1 ), Fields.ALL );</pre>
<p></code></p>
<p>Now we have all the components needed to calculate TF-IDF weights. We’ll use two kinds of joins – a <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a> followed by a <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/CoGroup.html">CoGroup</a> – to merge the three branches together:</p>
<p><code></p>
<pre>// join to bring together all the components for calculating TF-IDF
// the D side of the join is smaller, so it goes on the RHS
Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

// the IDF side of the join is smaller, so it goes on the RHS
Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );</pre>
<p></code></p>
<p>Then we calculate the weights using an <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/expression/ExpressionFunction.html">ExpressionFunction</a> in Cascading:</p>
<p><code></p>
<pre>// calculate the TF-IDF weights, per token, per document                                                                                                  
Fields tfidf = new Fields( &quot;tfidf&quot; );
String expression = &quot;(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )&quot;;
ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
Fields tfidfArguments = new Fields( &quot;tf_count&quot;, &quot;df_count&quot;, &quot;n_docs&quot; );
tfidfPipe = new Each( tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL );
fieldSelector = new Fields( &quot;tf_token&quot;, &quot;doc_id&quot;, &quot;tfidf&quot; );
tfidfPipe = new Retain( tfidfPipe, fieldSelector );
tfidfPipe = new Rename( tfidfPipe, tf_token, token );</pre>
<p></code></p>
<p>Now we can get back to the remainder of the workflow. We’ll keep the actual Word Count metrics, since those are useful for testing:</p>
<p><code></p>
<pre>// keep track of the word counts, which are useful for QA                                                                                                 
Pipe wcPipe = new Pipe( &quot;wc&quot;, tfPipe );

Fields count = new Fields( &quot;count&quot; );
wcPipe = new SumBy( wcPipe, tf_token, tf_count, count, long.class );
wcPipe = new Rename( wcPipe, tf_token, token );</pre>
<p></code></p>
<p>Last, we’ll add another sink tap to the <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/flow/FlowDef.html">FlowDef</a>, to include output data for our TF-IDF weights:</p>
<p><code></p>
<pre>// connect the taps, pipes, etc., into a flow                                                                                                             
FlowDef flowDef = FlowDef.flowDef()
 .setName( &quot;tfidf&quot; )
 .addSource( docPipe, docTap )
 .addSource( stopPipe, stopTap )
 .addTailSink( tfidfPipe, tfidfTap )
 .addTailSink( wcPipe, wcTap );</pre>
<p></code></p>
<p>We’ll change the name of the resulting <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/flow/Flow.html">Flow</a> too, to keep our code properly descriptive:</p>
<p><code></p>
<pre>// write a DOT file and run the flow                                                                                                                      
Flow tfidfFlow = flowConnector.connect( flowDef );
tfidfFlow.writeDOT( &quot;dot/tfidf.dot&quot; );
tfidfFlow.complete();</pre>
<p></code></p>
<p>Modify the <code>Main</code> method to make those changes, then build a JAR file. You should be good to go. For those keeping score, the resulting physical plan in Cascading for <em>Part 5</em> now uses eleven mappers and nine reducers. That amount jumped by 5x since our previous example.</p>
<p>The diagram for the Cascading flow will be in the <code>dot/</code> subdirectory after the app runs. Here we have annotated it to show where the <em>mapper</em> and <em>reducer</em> phases are running, and also the sections which were added since <em>Part 4</em>:</p>
<p><a href="http://www.cascading.org/files/2012/08/tfidf1.png"><img src="http://www.cascading.org/files/2012/08/tfidf1-141x300.png" alt="" title="TF-IDF flow diagram - &quot;Impatient&quot;, Part 5" width="141" height="300" class="aligncenter size-medium wp-image-195" /></a></p>
<p>If you want to read in more detail about the classes in the Cascading API which were used, see the Cascading 2.0 <a target="_blank" href="http://docs.cascading.org/cascading/2.0/userguide/html/">User Guide</a> and <a target="_blank" href="http://docs.cascading.org/cascading/2.0/javadoc/">JavaDoc</a>.</p>
<h2>Build</h2>
<p>The build for this example is based on using <a target="_blank" href="http://gradle.org/">Gradle</a>. The script is in <code>build.gradle</code> and to generate an <a target="_blank" href="https://www.jetbrains.com/idea/">IntelliJ project</a> use:</p>
<pre><code>gradle ideaModule</code></pre>
<p>To build the sample app from the command line use:</p>
<pre><code>gradle clean jar</code></pre>
<p>What you should have at this point is a JAR file which is nearly ready to drop into your <a target="_blank" href="https://maven.apache.org/">Maven</a> repo — almost. Actually, we provide a community jar repository for Cascading libraries and extensions at <a target="_blank" href="http://conjars.org">http://conjars.org</a></p>
<h2>Run</h2>
<p>Before running this sample app, you’ll need to have a supported release of <a target="_blank" href="http://hadoop.apache.org/">Apache Hadoop</a> installed. Here’s what was used to develop and test our example code:</p>
<pre><code>$ hadoop version
Hadoop 1.0.3</code></pre>
<p>Be sure to set your <code>HADOOP_HOME</code> environment variable. Then clear the <code>output</code> directory (Apache Hadoop insists, if you&#8217;re running in standalone mode) and run the app:</p>
<pre><code>rm -rf output
hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop output/tfidf</code></pre>
<p>Output text gets stored in the partition file <code>output/tfidf</code> which you can then verify:</p>
<pre><code>more output/tfidf/part-00000</code></pre>
<p>BTW, did you notice what the TF-IDF weights for the tokens <code>rain</code> and <code>shadow</code> were? Those represent what the documents have in common. How do those compare with weights for the other tokens? Conversely, consider the weights for <code>australia</code> (high weight) or <code>area</code> (different weights).</p>
<p>Here’s a <a target="_blank" href="https://gist.github.com/3043791">log file</a> from our run of the sample app, part 5. If your run looks terribly different, something is probably not set up correctly. Drop us a line on the <a target="_blank" href="https://groups.google.com/forum/?fromgroups#!forum/cascading-user">cascading-user</a> email forum. Or visit one of our user group meetings. [Coming up real soon&#8230;]</p>
<p>Also, compare these other excellent implementations of the example apps here – by <a href="http://sujitpal.blogspot.com/2012/08/scalding-for-impatient.html">Sujit Pal</a> in <a href="https://github.com/twitter/scalding/wiki">Scalding</a> and by <a href="https://github.com/Quantisan/Impatient">Paul Lam</a> in <a href="https://github.com/nathanmarz/cascalog/wiki">Cascalog</a>.</p>
<p>For those familiar with <a href="http://pig.apache.org/">Apache Pig</a>, we have included a <a href="https://raw.github.com/Cascading/Impatient/master/part5/src/scripts/tfidf.pig" title="comparable Apache Pig script" target="_blank">comparable script</a>, and to run that:</p>
<pre><code>rm -rf output
mkdir -p dot
pig -p docPath=./data/rain.txt -p wcPath=./output/wc -p stopPath=./data/en.stop -p tfidfPath=./output/tfidf ./src/scripts/tfidf.pig
</code></pre>
<p>Stay tuned for the next installments of our <a href="http://www.cascading.org/category/impatient/" target="_blank">Cascading for the Impatient</a> series.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2012/07/31/cascading-for-the-impatient-part-5/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading for the Impatient, Part 4</title>
		<link>http://www.cascading.org/2012/07/24/cascading-for-the-impatient-part-4/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-for-the-impatient-part-4</link>
		<comments>http://www.cascading.org/2012/07/24/cascading-for-the-impatient-part-4/#comments</comments>
		<pubDate>Tue, 24 Jul 2012 15:13:46 +0000</pubDate>
		<dc:creator>Paco Nathan</dc:creator>
				<category><![CDATA[Impatient]]></category>
		<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=183</guid>
		<description><![CDATA[<p>In our third installment of this series we showed how to write a custom Operation for a Cascading 2.0 application. If you haven’t read that yet, it’s probably best to start there. Today&#8217;s lesson takes that same Word Count app and expands on it to&#8230; <a class="more-link" href="http://www.cascading.org/2012/07/24/cascading-for-the-impatient-part-4/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>In our <a href="http://www.cascading.org/2012/07/17/cascading-for-the-impatient-part-3/" title="Cascading for the Impatient, Part 3" target="_blank">third installment of this series</a> we showed how to write a custom <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/package-summary.html">Operation</a> for a <a href="http://www.cascading.org/">Cascading 2.0</a> application. If you haven’t read that yet, it’s probably best to start there.</p>
<p>Today&#8217;s lesson takes that same Word Count app and expands on it to implement a <a href="http://en.wikipedia.org/wiki/Stop_words">stop words</a> filter, which is a list of tokens to nix from the stream. We’ll show how to use <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a> on two pipes, so we can perform that filtering at scale. Again, this code is leading toward an implementation of <a href="http://en.wikipedia.org/wiki/Tf*idf">TF-IDF</a> in Cascading. We’ll show best practices for workflow orchestration and <a href="http://en.wikipedia.org/wiki/Test-driven_development">test-driven development</a> (TDD) at scale.</p>
<h2>Theory</h2>
<p>The first question to consider is, why do we want to use a <a href="http://en.wikipedia.org/wiki/Stop_words">stop words</a> list? After all, the TF-IDF algorithm is supposed to filter out the less significant words anyway. Why would we need to include additional filtering if the TF-IDF is implemented correctly?</p>
<p>Use of a stop words list originated in work by <a href="http://en.wikipedia.org/wiki/Hans_Peter_Luhn">Hans Peter Luhn</a> at IBM Research, during the dawn of computing. The reasons for it are two-fold. On one hand, consider that the most common words in any given natural language are generally not useful for <a href="http://en.wikipedia.org/wiki/Text_analytics">text analytics</a>. For example in English, words such as “the”, “of”, “and” are probably not what you want to search, and probably not interesting for Word Count metrics. They represent the <a href="http://en.wikipedia.org/wiki/Long_Tail">long tail</a> of the token distribution: high frequency, low semantic value. Consequently, they cause the bulk of the processing. Natural languages tend to have on the order of <strong>10^5</strong> words, so the potential size of any stop words list is nicely bounded. Filtering those high-frequency words out of the token stream reduces the amount of processing required later in the workflow, dramatically.</p>
<p>On the other hand, you may also want to remove some words explicitly from the token stream. This almost always comes up in practice, especially when working with public discussions such as social network comments. Think about it, what are some of the most common words posted online in comments? Words which are not the most common words in “polite” English? Based on the math for TF-IDF, those would tend to get ranked highest. Do you really want those words to bubble up to the “most significant” positions in your text analytics? In automated systems which leverage <a href="http://en.wikipedia.org/wiki/Unsupervised_learning">unsupervised learning</a>, this can lead to <strong>highly embarrassing</strong> situations. <em>Caveat machinator</em>.</p>
<p>Next, let’s consider working with a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/joiner/package-summary.html">Joiner</a> in Cascading. We will have two pipes, one for the “scrubbed” token stream and another for the stop words list. We want to filter all instances of tokens from the stop words list out of the token stream. If we weren’t working in MapReduce, a naive approach would simply load the stop words list into a hashtable, then iterate through our token stream to look up each token in the hashtable and delete it if found. If we were coding in Hadoop directly, a less naive approach would be to put the stop words list into the <a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/filecache/DistributedCache.html">distributed cache</a> and have a job step which loads it during setup, then rinse/lather/repeat from the naive coding approach described above.</p>
<p>Instead we want to leverage the workflow orchestration in Cascading. One might try to write a custom operation in Cascading, as we did in <em>Part 3</em> — e.g., a custom <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/filter/package-summary.html">Filter</a>. That sounds like extra work, plus also extra code to verify and maintain, when the built-in primitives will tend to be more efficient anyway.</p>
<p>Cascading provides for <em>joins</em> on pipes, and conceptually a <a href="http://stackoverflow.com/questions/406294/left-join-and-left-outer-join-in-sql-server">Left Outer Join</a> would solve our requirement to filter stop words. Think of joining the token stream with the stop words list. When the result is non-null, the join has identified a stop word. Discard it.</p>
<p>Understand that there’s a big problem with using joins in MapReduce. Outside of the context of a <a href="http://en.wikipedia.org/wiki/Relational_database">relational database</a>, arbitrary joins do not work efficiently. Suppose you have <strong>N</strong> items in one tuple stream and <strong>M</strong> items in another, and want to join them? In the general case, for an arbitrary join, that requires <strong>N x M</strong> operations and also introduces a <a href="http://en.wikipedia.org/wiki/Data_dependency">data dependency</a>, such that the join cannot be performed in parallel. If both <strong>N</strong> and <strong>M</strong> are relatively large, say in the millions of tuples, then we’d end up processing <strong>10^12</strong> operations on a single processor — which kind of defeats the purpose, in terms of leveraging MapReduce.</p>
<p>Fortunately, if some of that data is <a href="http://en.wikipedia.org/wiki/Sparse_matrix">sparse</a> then we can use specific variants of joins to compute efficiently in parallel. Cascading includes a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a> which joins two or more tuple streams into a single stream via a Joiner — when all but one tuple stream are small enough to fit into memory. In other words, given some insights about the “shape” of the data, when you have a large data set (non-sparse) you can join with one or more small data sets (sparse) in memory. </p>
<p>A join has a left-hand side (LHS) and a right-hand side (RHS); in Cascading we put the <em>sparser</em> data on the right-hand side. So the HashJoin implements a non-blocking “asymmetrical join” or “replicated join”, where the left-most side will not block (accumulate into memory) in order to complete the join, but the right-most sides will.</p>
<p>Recall that stop words lists tend to be bounded at approximately <strong>10^5</strong>, which is relatively sparse when compared with an arbitrarily large token stream. In typical “web scale” text analytics use cases for TF-IDF, that might be in the range of billions of tokens, i.e., several orders of magnitude larger than our largest possible stop words list. Sounds like a great use case for HashJoin.</p>
<p>A conceptual diagram for this implementation of Word Count in Cascading is shown as:</p>
<p><a href="http://www.cascading.org/files/2012/08/plumb4.png"><img src="http://www.cascading.org/files/2012/08/plumb4.png" alt="" title="Conceptual Diagram - &quot;Impatient&quot;, Part 4" width="890" height="460" class="aligncenter size-full wp-image-204" /></a></p>
<h2>Source</h2>
<p>Download source for this example on <a href="https://github.com/Cascading/Impatient/tree/master/part4" title="Part 4, code repo on GitHub" target="_blank">GitHub</a>. You’ll need to clone the whole of this multi-part series:</p>
<pre><code>git clone git://github.com/Cascading/Impatient.git</code></pre>
<p>For quick reference, the source code and a log for this example are listed in a <a href="https://gist.github.com/3043745">gist</a>. The input data stays the same as in the <a href="https://gist.github.com/2911686">earlier code</a>.</p>
<p>This example in <em>Part 4</em> uses a <strong>HashJoin</strong> in Cascading to implement a stop words list, filtering some words out of the token stream prior to counting.</p>
<p>First, let’s add another source tap to read the stop words list as an input data set:</p>
<pre><code>String stopPath = args[ 2 ];
Fields stop = new Fields( &quot;stop&quot; );
Tap stopTap = new Hfs( new TextDelimited( stop, true, &quot;\t&quot; ), stopPath );</code></pre>
<p>Next we’ll insert another pipe into the assembly, placing <code>tokenPipe</code> between our “scrub” and “count” sections of our workflow. That’s where the <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html">HashJoin</a> gets performed, implementing a left join:</p>
<pre><code>// perform a left join to remove stop words, discarding the rows
// which joined with stop words, i.e., were non-null after left join
Pipe stopPipe = new Pipe( &quot;stop&quot; );
Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );</code></pre>
<p>Next we discard the non-null results from the left join, using a <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/regex/RegexFilter.html">RegexFilter</a>:</p>
<pre><code>tokenPipe = new Each( tokenPipe, stop, new RegexFilter( &quot;^$&quot; ) );</code></pre>
<p>Now this new <code>tokenPipe</code> can be fitted back into the <code>wcPipe</code> which we used before. The workflow continues on much the same from there:</p>
<pre><code>Pipe wcPipe = new Pipe( &quot;wc&quot;, tokenPipe );</code></pre>
<p>Last, we’ll add the additional source tap to the <a href="http://docs.cascading.org/cascading/2.0/javadoc/cascading/flow/FlowDef.html">FlowDef</a>, to include input data for our stop words list:</p>
<pre><code>// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
 .setName( &quot;wc&quot; )
 .addSource( docPipe, docTap )
 .addSource( stopPipe, stopTap )
 .addTailSink( wcPipe, wcTap );</code></pre>
<p>Modify the <code>Main</code> method to make those changes, then build a JAR file. You should be good to go. For those keeping score, the resulting physical plan in MapReduce for <em>Part 4</em> still uses one mapper and one reducer.</p>
<p>The diagram for the Cascading flow will be in the <code>dot/</code> subdirectory after the app runs. Here we have annotated it to show where the <em>mapper</em> and <em>reducer</em> phases are running, and also the section which was added since <em>Part 3</em>:</p>
<p><a href="http://www.cascading.org/files/2012/08/wc_part4.png"><img src="http://www.cascading.org/files/2012/08/wc_part4-237x300.png" alt="" title="Word Count flow diagram - &quot;Impatient&quot;, Part 4" width="237" height="300" class="aligncenter size-medium wp-image-189" /></a></p>
<p>If you want to read in more detail about the classes in the Cascading API which were used, see the Cascading 2.0 <a href="http://docs.cascading.org/cascading/2.0/userguide/html/">User Guide</a> and <a href="http://docs.cascading.org/cascading/2.0/javadoc/">JavaDoc</a>.</p>
<h2>Build</h2>
<p>The build for this example is based on using <a href="http://gradle.org/">Gradle</a>. The script is in <code>build.gradle</code> and to generate an <a href="https://www.jetbrains.com/idea/">IntelliJ project</a> use:</p>
<pre><code>gradle ideaModule</code></pre>
<p>To build the sample app from the command line use:</p>
<pre><code>gradle clean jar</code></pre>
<p>What you should have at this point is a JAR file which is nearly ready to drop into your <a href="https://maven.apache.org/">Maven</a> repo — almost. Actually, we provide a community jar repository for Cascading libraries and extensions at <a href="http://conjars.org">http://conjars.org</a>.</p>
<h2>Run</h2>
<p>Before running this sample app, you’ll need to have a supported release of <a href="http://hadoop.apache.org/">Apache Hadoop</a> installed. Here’s what was used to develop and test our example code:</p>
<pre><code>$ hadoop version
Hadoop 1.0.3</code></pre>
<p>Be sure to set your <code>HADOOP_HOME</code> environment variable. Then clear the <code>output</code> directory (Apache Hadoop insists, if you&#8217;re running in standalone mode) and run the app:</p>
<pre><code>rm -rf output
hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop</code></pre>
<p>Output text gets stored in a partition file under <code>output/wc</code> which you can then verify:</p>
<pre><code>more output/wc/part-00000</code></pre>
<p>Here’s a <a href="https://gist.github.com/3043745">log file</a> from our run of the sample app, part 4. If your run looks terribly different, something is probably not set up correctly. Drop us a line on the <a href="https://groups.google.com/forum/?fromgroups#!forum/cascading-user">cascading-user</a> email forum. Or visit one of our user group meetings. [Coming up real soon&#8230;]</p>
<p>Also, compare these other excellent implementations of the example apps here – by <a href="http://sujitpal.blogspot.com/2012/08/scalding-for-impatient.html">Sujit Pal</a> in <a href="https://github.com/twitter/scalding/wiki">Scalding</a> and by <a href="https://github.com/Quantisan/Impatient">Paul Lam</a> in <a href="https://github.com/nathanmarz/cascalog/wiki">Cascalog</a>.</p>
<p>For those familiar with <a href="http://pig.apache.org/">Apache Pig</a>, we have included a <a href="https://raw.github.com/Cascading/Impatient/master/part4/src/scripts/wc.pig" title="comparable Apache Pig script" target="_blank">comparable script</a>, and to run that:</p>
<pre><code>rm -rf output
mkdir -p dot
pig -p docPath=./data/rain.txt -p wcPath=./output/wc -p stopPath=./data/en.stop ./src/scripts/wc.pig
</code></pre>
<p>Stay tuned for the next installments of our <a href="http://www.cascading.org/category/impatient/" title="Cascading for the Impatient" target="_blank">Cascading for the Impatient</a> series.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2012/07/24/cascading-for-the-impatient-part-4/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cascading Software Development Kit</title>
		<link>http://www.cascading.org/2012/07/18/cascading-software-development-kit/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=cascading-software-development-kit</link>
		<comments>http://www.cascading.org/2012/07/18/cascading-software-development-kit/#comments</comments>
		<pubDate>Wed, 18 Jul 2012 18:21:26 +0000</pubDate>
		<dc:creator>concurrent</dc:creator>
				<category><![CDATA[News]]></category>

		<guid isPermaLink="false">http://www.cascading.org/?p=181</guid>
		<description><![CDATA[<p>The Cascading SDK is now available for download. The SDK includes Cascading source and jars, and many of the Cascading based tools like Load and Multitool. It also includes an Amazon Elastic MapReduce install script (bootstrap action) that will pre-install all included tools on the&#8230; <a class="more-link" href="http://www.cascading.org/2012/07/18/cascading-software-development-kit/">Continue reading <span class="meta-nav">&#8594;</span></a></p>
]]></description>
				<content:encoded><![CDATA[<p>The <a href="http://www.cascading.org/sdk/">Cascading SDK</a> is now available for <a href="http://www.cascading.org/downloads/">download</a>.</p>
<p>The SDK includes Cascading source and jars, and many of the Cascading based tools like Load and Multitool.</p>
<p>It also includes an Amazon Elastic MapReduce install script (bootstrap action) that will pre-install all included tools on the master node.</p>
]]></content:encoded>
			<wfw:commentRss>http://www.cascading.org/2012/07/18/cascading-software-development-kit/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>
