spark_streaming_examples


Create Spark Streaming Context:
==========================================
scala:
---------------


import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3

// Create a local StreamingContext with two working threads and batch interval of 1 second.
// The master requires 2 cores to prevent a starvation scenario.

val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
val ssc = new StreamingContext(conf, Seconds(1))

// Create a DStream that will connect to hostname:port, like localhost:9999
val lines = ssc.socketTextStream("localhost", 9999)

// Split each line into words
val words = lines.flatMap(_.split(" "))


// Count each word in each batch
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)

// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print()

ssc.start()             // Start the computation
ssc.awaitTermination()  // Wait for the computation to terminate



java:
---------------
import org.apache.spark.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.*;
import org.apache.spark.streaming.api.java.*;

import scala.Tuple2;

import java.util.Arrays;


// Create a local StreamingContext with two working threads and batch interval of 1 second
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

// Create a DStream that will connect to hostname:port, like localhost:9999
JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

// Split each line into words
JavaDStream<String> words = lines.flatMap(
  new FlatMapFunction<String, String>() {
    @Override public Iterable<String> call(String x) {
      return Arrays.asList(x.split(" "));
    }
  });


// Count each word in each batch
JavaPairDStream<String, Integer> pairs = words.mapToPair(
  new PairFunction<String, String, Integer>() {
    @Override public Tuple2<String, Integer> call(String s) {
      return new Tuple2<>(s, 1);
    }
  });
JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
  new Function2<Integer, Integer, Integer>() {
    @Override public Integer call(Integer i1, Integer i2) {
      return i1 + i2;
    }
  });


// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print();


jssc.start();             // Start the computation
jssc.awaitTermination();  // Wait for the computation to terminate





python:
---------------
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working threads and batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate





==================================================================

nc -lk 9999

$SPARK_HOME/bin/run-example streaming.NetworkWordCount localhost 9999

$SPARK_HOME/bin/run-example streaming.JavaNetworkWordCount localhost 9999

$SPARK_HOME/bin/spark-submit $SPARK_HOME/examples/src/main/python/streaming/network_wordcount.py localhost 9999
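
Run nc -lk 9999 in one terminal first so the socket is listening, then start one of the word-count examples in another terminal. Typing a line such as "hello world hello" into the nc terminal should produce batch output in the example's console roughly along these lines (the timestamp will differ):

-------------------------------------------
Time: <batch time> ms
-------------------------------------------
(hello,2)
(world,1)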

==================================================================

$SPARK_HOME/bin/run-example streaming.TwitterPopularTags FlRx3d0n8duIQ0UvGeGtTA DS7TTbxhmQ7oCUlDntpQQRqQllFFOiyNoOMEDD0lA 1643982224-xTfNpLrARoWKxRh9KtFqc7aoB8KAAHkCcfC5vDk PqkbuBqF3AVskgx1OKgXKOZzV7EMWRmRG0p8hvLQYKs
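
The four arguments are the Twitter OAuth credentials: consumer key, consumer secret, access token and access token secret. A rough Scala sketch of the kind of logic this example runs, assuming ssc is a StreamingContext like the one created above and that the credentials have already been read into the vals used below (the names here are illustrative):

import org.apache.spark.streaming.twitter._

// Twitter4j reads the OAuth credentials from system properties
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
System.setProperty("twitter4j.oauth.accessToken", accessToken)
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

val stream = TwitterUtils.createStream(ssc, None)

// Pull hashtags out of each tweet's text
val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

// Count hashtags over a sliding 60-second window and print them most-frequent first
val topCounts60 = hashTags.map((_, 1))
  .reduceByKeyAndWindow(_ + _, Seconds(60))
  .map { case (topic, count) => (count, topic) }
  .transform(_.sortByKey(ascending = false))

topCounts60.print()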

==================================================================

$FLUME_HOME/bin/flume-ng agent -n spark-flume --conf $FLUME_HOME/conf -f $FLUME_HOME/test/spark-flume.conf -Dflume.root.logger=DEBUG,console

telnet localhost 3000

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.FlumeEventCount localhost 1234
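
FlumeEventCount sets up a push-based Flume receiver: the Flume agent's Avro sink must point at the host/port given on the command line (localhost:1234 above), while the telnet session presumably feeds the agent's source on port 3000 as configured in spark-flume.conf. A minimal Scala sketch, assuming ssc is a StreamingContext as above:

import org.apache.spark.streaming.flume._

// Listen for Avro events pushed by the Flume agent's sink
val flumeStream = FlumeUtils.createStream(ssc, "localhost", 1234)

// Count the events received in each batch
flumeStream.count()
  .map(cnt => "Received " + cnt + " flume events.")
  .print()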

==================================================================

$SPARK_HOME/bin/run-example streaming.HdfsWordCount file:/home/orienit/spark/input/test
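
HdfsWordCount monitors a directory and runs a word count over any new files that appear there. A minimal Scala sketch, assuming ssc is a StreamingContext as above and using the directory path from the command line:

// Process new files written into the watched directory
val lines = ssc.textFileStream("file:/home/orienit/spark/input/test")
val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
wordCounts.print()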

==================================================================
$SPARK_HOME/bin/run-example streaming.QueueStream
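
QueueStream needs no external source: it builds a DStream from a queue of RDDs, which is handy for testing. Each batch interval, one RDD is dequeued and treated as that batch's data. A minimal Scala sketch, assuming ssc is a StreamingContext as above:

import scala.collection.mutable
import org.apache.spark.rdd.RDD

val rddQueue = new mutable.Queue[RDD[Int]]()
val inputStream = ssc.queueStream(rddQueue)
inputStream.map(x => (x % 10, 1)).reduceByKey(_ + _).print()

ssc.start()
// Push a few RDDs into the queue; each one becomes a batch
for (_ <- 1 to 5) {
  rddQueue.synchronized {
    rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
  }
  Thread.sleep(1000)
}
ssc.stop()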

==================================================================

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewStream activeUserCount localhost 44444

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewStream popularUsersSeen localhost 44444



==================================================================
Usage: KafkaWordCount <zkQuorum> <group> <topics> <numThreads>

$SPARK_HOME/bin/run-example org.apache.spark.examples.streaming.KafkaWordCount localhost:2181 my-consumer-group topic1,topic2 1
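
A minimal Scala sketch of the receiver-based Kafka integration this example uses, matching the arguments above (ZooKeeper quorum, consumer group, comma-separated topics, threads per topic) and assuming ssc is a StreamingContext as above:

import org.apache.spark.streaming.kafka._

val zkQuorum = "localhost:2181"
val group = "my-consumer-group"
// topic -> number of receiver threads for that topic
val topicMap = "topic1,topic2".split(",").map((_, 1)).toMap

// Each element is (key, message); keep just the message text
val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)

val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
wordCounts.print()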


==================================================================



$SPARK_HOME/bin/run-example streaming.StatefulNetworkWordCount localhost 9999
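
StatefulNetworkWordCount keeps a running count per word across batches rather than counting each batch independently. A minimal Scala sketch of that idea using updateStateByKey, assuming ssc is a StreamingContext as above (the checkpoint directory is an arbitrary example; stateful operations require one):

ssc.checkpoint("/tmp/spark-streaming-checkpoint")

// Merge the new counts from this batch into the running total for each word
val updateFunc = (newValues: Seq[Int], runningCount: Option[Int]) =>
  Some(newValues.sum + runningCount.getOrElse(0))

val lines = ssc.socketTextStream("localhost", 9999)
val runningCounts = lines.flatMap(_.split(" "))
  .map((_, 1))
  .updateStateByKey[Int](updateFunc)

runningCounts.print()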