Practical 1: Write a program in MapReduce for the WordCount operation.
WordCount.java (also create an input file data1.txt, shown below)
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{
private Text word = new Text();
public void map(Object key, Text value, Context context )
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, new IntWritable(1));
}
}
}
public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,Context context) throws
IOException, InterruptedException {
int sum = 0;
for (IntWritable x : values) { sum += x.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class); // mentioning Main class
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
data1.txt (input data)
how are you where are you
Steps to run the program:
Start Hadoop:
start-dfs.sh
start-yarn.sh
hadoop com.sun.tools.javac.Main WordCount.java
ls -l
hdfs dfs -ls /
hdfs dfs -rm -r /wordcount
jar cf wc.jar WordCount*.class
Open http://localhost:9870/explorer.html#/ in the browser and check whether the file is present on HDFS.
hdfs dfs -mkdir -p /wordcount/input
hdfs dfs -copyFromLocal data1.txt /wordcount/input
hadoop jar wc.jar WordCount /wordcount/input /wordcount/output
hdfs dfs -cat /wordcount/output/part-r-00000
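For the single line in data1.txt above, the job should produce roughly the following in part-r-00000 (word and count separated by a tab):
are	2
how	1
where	1
you	2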
Practical 2: Write a program in MapReduce for Matrix Multiplication.
MatrixMultiply.java
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class MatrixMultiply {
public static void main(String[] args) throws Exception {
if (args.length != 2) { System.err.println("Usage: MatrixMultiply <in_dir> <out_dir>");
System.exit(2);
}
Configuration conf = new Configuration();
conf.set("n", "100");
conf.set("p", "1000");
@SuppressWarnings("deprecation")
Job job = new Job(conf, "MatrixMultiply");
job.setJarByClass(MatrixMultiply.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class); job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Map.java
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class Map
extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
int m = Integer.parseInt(conf.get("m"));
int p = Integer.parseInt(conf.get("p"));
String line = value.toString();
String[] indicesAndValue = line.split(",");
Text outputKey = new Text();
Text outputValue = new Text();
if (indicesAndValue[0].equals("M")) {
for (int k = 0; k < p; k++) {
outputKey.set(indicesAndValue[1] + "," + k);
outputValue.set(indicesAndValue[0] + "," + indicesAndValue[2] + "," +
indicesAndValue[3]);
context.write(outputKey, outputValue);
}
} else {
// element N[j][k]: emit key (i,k) for every row i of M, value "N,j,N[j][k]"
for (int i = 0; i < m; i++) {
outputKey.set(i + "," + indicesAndValue[2]);
outputValue.set("N," + indicesAndValue[1] + "," + indicesAndValue[3]);
context.write(outputKey, outputValue);
}
}
}
}
Reduce.java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
public class Reduce
extends org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
InterruptedException {
String[] value;
HashMap<Integer, Float> hashA = new HashMap<Integer, Float>();
HashMap<Integer, Float> hashB = new HashMap<Integer, Float>();
for (Text val : values) {
value = val.toString().split(",");
if (value[0].equals("M")) {
hashA.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
} else {
hashB.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
}
}
int n = Integer.parseInt(context.getConfiguration().get("n"));
float result = 0.0f;
float m_ij;
float n_jk;
for (int j = 0; j < n; j++) {
m_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f;
n_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f;
result += m_ij * n_jk;
}
if (result != 0.0f) {
context.write(null,
new Text(key.toString() + "," + Float.toString(result)));
}
}
}
Create a file named matrixA.txt and put the following in it:
M,0,0,12
M,0,1,13
M,1,0,14
M,1,1,15
Create a file named matrixB.txt and put the following in it:
N,0,0,11
N,0,1,13
N,1,0,14
N,1,1,19
Steps to run the program:
start-dfs.sh
start-yarn.sh
hadoop com.sun.tools.javac.Main MatrixMultiply.java Map.java Reduce.java
jar cf mm.jar *.class
ls -l
hdfs dfs -mkdir /MatrixMultiply
hdfs dfs -mkdir /MatrixMultiply/input
hdfs dfs -ls /
hdfs dfs -copyFromLocal matrixA.txt matrixB.txt /MatrixMultiply/input
hadoop jar mm.jar MatrixMultiply /MatrixMultiply/input /MatrixMultiply/output
hdfs dfs -cat /MatrixMultiply/output/part-r-00000
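With the 2x2 sample matrices above and the dimensions m = n = p = 2 set in main, the product M x N is [[314, 403], [364, 467]], so the output should contain lines of the form i,k,value, roughly:
0,0,314.0
0,1,403.0
1,0,364.0
1,1,467.0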
MONGODB
In the first command prompt, type: mongod
In a second command prompt, type: mongosh
Practical 2: Sample Database Creation
Start cmd -> mongod
Start a new cmd -> mongosh
show dbs
use tanvi (any database name can be used)
Practical 3: Query the Sample Database using MongoDB querying commands
db.createCollection("student")
db.student.insertOne({name: "Tanvi Tawade", rollno:61, div:"A"})
db.student.insertMany([{name: "Namrata Gaikwad", rollno:12, div: "B"},
{name: "Omkar Daifale", rollno:10, div:"A"},
{name: "Chinmay Warang", rollno:69, div:"A"},
{name: "Shreya Nikam", rollno:33, div:"B"},
{name: "Pratiksha Majrekar", rollno:31, div:"A"}, (ekach snippet code ahe )
{name: "Heth Shah", rollno:52, div:"B"},
{name: "Ketan Bhoir", rollno:6, div:"B"},
{name: "Uday Gavada", rollno:16, div:"A"},
{name: "Prathmesh Patil", rollno:38, div:"B"},
{name: "Swaraj Wadkar", rollno:67, div:"A"}])
db.student.find({})
db.student.find().pretty()
db.student.findOne({name:"Tanvi Tawade"})
db.student.find({name: {$in:["Tanvi Tawade", "Swaraj Wadkar"]}})
db.student.find({$and:[{name:"Tanvi Tawade"},{rollno:61}]})
db.student.find({$or:[{name:"Tanvi Tawade"},{rollno:31}]})
db.student.find({rollno:{$lt:62}, $or:[{name:"Tanvi Tawade"},{div:"A"}] })
db.student.find({rollno:{$lt:62}, $or:[{name:"Tanvi Tawade"},{div:"B"}] })
db.student.find({$or:[{name:/^C/},{name:/^T/}]})
db.student.find({$nor:[{name:"Swaraj Wadkar"},{div:"B"}]})
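Two optional read modifiers that fit this practical, sorting by roll number and limiting the result:
db.student.find().sort({rollno:1})
db.student.find().sort({rollno:-1}).limit(3)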
db.student.find({name:"Heth Shah"})
db.student.update({name:"Heth Shah"},{$set: {div:"A"}})
db.student.insertMany([{name: "Namrata Gaikwad", rollno:12, div: "B"},
{name: "Omkar Daifale", rollno:10, div:"A"},
{name: "Shreya Nikam", rollno:33, div:"B"}])
db.student.updateMany({div:"B"},{$set:{div:"A"}})
db.student.findOneAndUpdate({name:"Namrata Gaikwad"},{$set:{div:"B",rollno:13} })
db.student.deleteOne({rollno:38})
db.student.deleteMany({$or: [{rollno:{$lt:30}},{div:"B"}]})
db.student.createIndex({name:1, rollno:1},{name: "idx_name_rollno"})
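To verify that the index was created, list the collection's indexes:
db.student.getIndexes()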
HIVE
Practical 3: Create Database & Table in Hive
To start Hive, go to /home/hadoop/apache-hive-3.1.2-bin and run:
start-dfs.sh
start-yarn.sh
hive
create database tanvi;
show databases;
use tanvi;
create table student(rno int, name string,section string, marks int);
show tables;
insert into table student values(61,'Tanvi', 'A', 83);
select * from student;
insert into table student values(12, 'Namrata', 'B', 54), (10,'Omkar','A',53),
(31,'Pratiksha','A',89),(33,'Shreya','B',23),(6,'Ketan','B',47),(69,'Chinmay','B',59),
(16,'Uday','A',78),(52,'Heth','B',68),(38,'Prathmesh','B',48), (67,'Swaraj','A',56);
Practical 4: Hive Partitioning
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode;
set hive.exec.dynamic.partition.mode=nonstrict;
Create a file student.txt
61,Tanvi,A,83
12,Namrata,B,54
10,Omkar,A,53
31,Pratiksha,A,89
33,Shreya,B,23
6,Ketan,B,47
69,Chinmay,B,59
16,Uday,A,78
52,Heth,B,68
38,Prathmesh,B,48
67,Swaraj,A,56
create table student_part(rno int, name string,marks int)
partitioned by(section string)
row format delimited fields terminated by ',' ;
LOAD DATA LOCAL INPATH '/home/hadoop/hive/student.txt' INTO TABLE student_part;
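If the LOAD above does not fill the partitions as expected (Hive expects the partition column to be the last field of each row), a common alternative is a dynamic-partition insert from the plain student table of the previous practical; a sketch, assuming that table still holds the same rows:
insert into table student_part partition(section) select rno, name, marks, section from student;
show partitions student_part;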
DESCRIBE FORMATTED student_part;
SELECT COUNT(*) FROM student_part WHERE section = 'A';
Practical 7: Hive Views and Indexes
(The employee table used here is created in Practical 8 below; create it first.)
CREATE VIEW emp_view AS SELECT * FROM employee WHERE salary>60000;
select * from emp_view;
drop view emp_view;
Practical 8: HiveQL : Select Where, Select OrderBy, Select GroupBy, Select Joins
Create a text file emp.txt
61,Tanvi,Manager,83000
12,Namrata,Developer,54000
10,Omkar,Tester,53000
31,Pratiksha,Manager,89000
33,Shreya,Developer,23000
6,Ketan,Tester,47000
69,Chinmay,B,59000
16,Uday,Tester,78000
52,Heth,Developer,68000
38,Prathmesh,Developer,48000
67,Swaraj,Tester,56000
CREATE TABLE tanvi.employee (empcode INT, ename STRING, job STRING, salary INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/emp.txt' INTO TABLE employee;
select * from employee;
select count(*) from employee;
select avg(salary) from employee;
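The practical title also lists Select Where and Select OrderBy; minimal examples on the same table:
select * from employee where salary > 60000;
select ename, salary from employee order by salary desc;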
ALTER TABLE employee RENAME TO emp;
Create a new emp.txt (this time with a department number column):
61,Tanvi,1,83000
12,Namrata,3,54000
10,Omkar,2,53000
31,Pratiksha,1,89000
33,Shreya,2,23000
6,Ketan,2,47000
69,Chinmay,3,59000
16,Uday,2,78000
52,Heth,3,68000
38,Prathmesh,3,48000
67,Swaraj,2,56000
37,Rupali,2,66000
CREATE TABLE tanvi.employee ( empcode INT,ename STRING, dno INT,salary INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/emp.txt' INTO TABLE employee;
select * from employee;
Create a dept.txt
1,Manager,Mumbai
2,Tester,Pune
3,Developer,Delhi
CREATE TABLE tanvi.department ( dno INT, dname STRING, loacation STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/hadoop/hive/dept.txt' INTO TABLE department;
select * from department;
select * from employee e, department d where e.dno=d.dno;
select count(*) from employee group by dno;
select count(*) from employee e ,department d where e.dno=d.dno and d.dname='Manager';
ALTER TABLE employee ADD COLUMNS (dept STRING COMMENT 'Department name');
SELECT dno, COUNT(*) FROM tanvi.employee GROUP BY dno;
SELECT e.empcode, e.ename, e.dno, d.loacation, d.dname FROM employee e JOIN department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, e.dno, d.loacation, d.dname FROM employee e LEFT OUTER JOIN department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, e.dno, d.loacation, d.dname FROM employee e RIGHT OUTER JOIN department d ON (e.dno = d.dno);
SELECT e.empcode, e.ename, e.dno, d.loacation FROM employee e FULL OUTER JOIN department d ON (e.dno = d.dno);
PIG
Practical 2: Pig Latin Basic
1. Display total number of students
Create a student.txt
61, Tanvi Tawade, maths, 85
61, Tanvi Tawade, aiml, 90
61, Tanvi Tawade, dscc, 78
12, Namrata Gaikwad, maths, 75
12, Namrata Gaikwad, aiml, 82
12, Namrata Gaikwad, dscc, 90
10, Omkar Daifale, maths, 92
10, Omkar Daifale, aiml, 88
10, Omkar Daifale, dscc, 76
69, Chinmay Warang, maths, 80
69, Chinmay Warang, aiml, 85
69, Chinmay Warang, dscc, 92
33, Shreya Nikam, maths, 88
33, Shreya Nikam, aiml, 78
33, Shreya Nikam, dscc, 85
31, Pratiksha Majrekar, maths, 76
31, Pratiksha Majrekar, aiml, 90
31, Pratiksha Majrekar, dscc, 82
52, Heth Shah, maths, 90
52, Heth Shah, aiml, 85
52, Heth Shah, dscc, 88
6, Ketan Bhoir, maths, 82
6, Ketan Bhoir, aiml, 76
6, Ketan Bhoir, dscc, 90
16, Uday Gavada, maths, 85
16, Uday Gavada, aiml, 92
16, Uday Gavada, dscc, 78
38, Prathmesh Patil, maths, 78
38, Prathmesh Patil, aiml, 85
38, Prathmesh Patil, dscc, 90
67, Swaraj Wadkar, maths, 92
67, Swaraj Wadkar, aiml, 80
67, Swaraj Wadkar, dscc, 86
stud = LOAD 'student.txt' using PigStorage(',') AS (rno:int, name:chararray, sub:chararray, mark:int);
dump stud;
Describe stud;
Output: stud: {rno: int, name: chararray, sub: chararray, mark: int}
A = group stud all;
dump A;
B = foreach A generate COUNT(stud);
dump B;
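Note that COUNT(stud) above counts all 33 (rno, subject, mark) records; if the goal is the number of distinct students, one possible way is:
ids = foreach stud generate rno;
uniq = distinct ids;
G = group uniq all;
C = foreach G generate COUNT(uniq);
dump C;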
2. Display subject wise student count
A = group stud by sub;
dump A;
B = foreach A generate COUNT(stud);
dump B;
B = foreach A generate AVG(stud.mark);
dump B;
B = foreach A generate stud.sub, AVG(stud.mark);
dump B;
B = foreach A generate stud.name, AVG(stud.mark);
dump B;
B = foreach A generate stud.name, SUM(stud.mark);
dump B;
B = foreach A generate stud.sub, SUM(stud.mark);
dump B;
B = foreach A generate stud.name, MAX(stud.mark);
dump B;
B = foreach A generate MAX(stud.mark);
dump B;
B = foreach A generate stud.name, MIN(stud.mark);
dump B;
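In the statements above, stud.sub and stud.name come out as bags; to show the grouping key (the subject) itself next to an aggregate, the built-in alias group can be used, for example:
B = foreach A generate group, AVG(stud.mark), MAX(stud.mark);
dump B;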
Practical 4: Download the data
pig -x local
Create a student.txt
61, Tanvi, Tawade, 22, 9766543210, Mumbai
12, Namrata, Gaikwad, 23, 9876543210, Mumbai
10, Omkar, Daifale, 22, 8765432109, Bangalore
69, Chinmay, Warang, 24, 7654321098, Delhi
33, Shreya, Nikam, 21, 6543210987, Mumbai
31, Pratiksha, Majrekar, 25, 5432109876, Hyderabad
52, Heth, Shah, 23, 4321098765, Bangalore
6, Ketan, Bhoir, 24, 3210987654, Mumbai
16, Uday, Gavada, 22, 2109876543, Delhi
38, Prathmesh, Patil, 21, 1098765432, Hyderabad
67, Swaraj, Wadkar, 25, 9876543210, Chennai
student1 = LOAD 'student.txt' using PigStorage(',') AS (rno:chararray, fname:chararray, lname:chararray, age:int, phone:long, city:chararray);
dump student1;
STORE student1 into 'student_output.txt' using PigStorage('|');
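STORE writes a directory named student_output.txt containing part files; in local mode it can be inspected from the Grunt shell (the part file name may differ):
ls student_output.txt
cat student_output.txt/part-m-00000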
Practical 5: Create your Script
1. Write the following Pig Latin commands in a file called emp_data.pig.
emp = load 'emp.txt' using PigStorage(',') AS (eid:chararray, name:chararray, designation:chararray, deptid:chararray, salary:int);
STORE emp into 'emp_output.txt' using PigStorage(',');
ss = FOREACH emp GENERATE eid, name, deptid;
dump ss;
Practical 6: Save and Execute the Script
2. Execute the Apache Pig script in any one of the following ways.
From the OS shell: pig -x local emp_data.pig
From the Grunt shell (the script runs in its own scope): exec emp_data.pig
From the Grunt shell (runs in the current session, so its aliases remain available): run emp_data.pig
Practical 7: Pig Operations : Diagnostic Operators, Grouping and Joining, Combining
& Splitting, Filtering, Sorting
Create a student.txt
61, Tanvi, Tawade, 22, 9766543210, Mumbai
12, Namrata, Gaikwad, 23, 9876543210, Mumbai
10, Omkar, Daifale, 22, 8765432109, Bangalore
69, Chinmay, Warang, 24, 7654321098, Delhi
33, Shreya, Nikam, 21, 6543210987, Mumbai
31, Pratiksha, Majrekar, 25, 5432109876, Hyderabad
52, Heth, Shah, 23, 4321098765, Bangalore
6, Ketan, Bhoir, 24, 3210987654, Mumbai
16, Uday, Gavada, 22, 2109876543, Delhi
38, Prathmesh, Patil, 21, 1098765432, Hyderabad
67, Swaraj, Wadkar, 25, 9876543210, Chennai
student1 = LOAD 'student.txt' using PigStorage(',') AS (rno:chararray, fname:chararray, lname:chararray, age:int, phone:long, city:chararray);
a. Diagnostic Operators
dump student1;
describe student1;
explain student1;
stud_11 = FILTER student1 BY age < 23;
dump stud_11;
C = FOREACH student1 GENERATE rno, fname, city;
dump C;
illustrate C;
b. Grouping and Joining
stud_1 = GROUP student1 BY city;
dump stud_1;
describe stud_1;
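A typical follow-up on the grouped relation is a city-wise student count, for example:
city_count = foreach stud_1 generate group, COUNT(student1);
dump city_count;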
stud_2 = GROUP student1 BY (city,age);
dump stud_2;
describe stud_2;
1. Self join
A = LOAD 'student.txt' using PigStorage(',') AS (rno:chararray, fname:chararray, lname:chararray, age:int, phone:long, city:chararray);
B = LOAD 'student.txt' using PigStorage(',') AS (rno:chararray, fname:chararray, lname:chararray, age:int, phone:long, city:chararray);
C = JOIN A BY age, B BY age;
dump C;
2. Inner join (equijoin): an inner join returns rows when there is a match in both tables.
Create a emp.txt
61,Tanvi,Manager,1,83000
12,Namrata,Quality Assurance,3,54000
10,Omkar,Engineering,2,53000
31,Pratiksha,Manager,1,89000
33,Shreya,Testing,2,23000
6,Ketan,Testing,2,47000
69,Chinmay,Quality Assurance,3,59000
16,Uday,Testing,2,78000
52,Heth,Quality Assurance,3,68000
38,Prathmesh,Quality Assurance,3,48000
67,Swaraj,Testing,2,56000
37,Rupali,Testing,2,66000
emp = LOAD 'emp.txt' using PigStorage(',') AS (eid:chararray, name:chararray, designation:chararray, deptid:chararray, salary:int);
dump emp;
Create a dept.txt
1,Finance
2,Testing
3,Quality Assurance
dept= LOAD 'dept.txt' using PigStorage(',') AS (deptid:chararray, dname:chararray);
dump dept;
emp_dept_innerjoin = JOIN emp BY deptid, dept BY deptid;
dump emp_dept_innerjoin;
3. LEFT join
emp_dept_left = JOIN emp BY deptid LEFT, dept BY deptid;
dump emp_dept_left ;
4. RIGHT JOIN
emp_dept_right = JOIN emp BY deptid RIGHT, dept BY deptid;
dump emp_dept_right;
5. FULL outer join
emp_dept_full= JOIN emp BY deptid FULL OUTER, dept BY deptid;
dump emp_dept_full ;
Cross Product
cross_prod = CROSS emp, dept;
dump cross_prod;
c. Combining & Splitting
SPLIT emp into sal1 if salary<54000, sal2 if salary>=54000;
dump sal1;
dump sal2;
d. Filtering, Sorting
filter_designation = FILTER emp BY designation == 'Manager';
dump filter_designation;
Order by
S = order emp by name desc;
dump S;
S = order emp by name asc;
dump S;
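To keep only the first few rows of a sorted relation, LIMIT can be combined with the ordering, for example:
top3 = limit S 3;
dump top3;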
SPARK
Practical 2: Downloading a Data Set and Processing it in Spark
spark-shell
val mydfT = spark.read.csv("/home/hadoop/SparkT/student.csv")
mydfT.printSchema()
mydfT.show
mydfT.createOrReplaceTempView("BVIMIT")
val mydf2 = spark.sql("SELECT * FROM BVIMIT")
mydf2.show()
val mydf2 = spark.sql("describe BVIMIT")
mydf2.show
val mydf2 = spark.sql("SELECT * FROM BVIMIT where _c1 > 50")
mydf2.show
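Because the CSV is read without options, the columns come out as _c0, _c1, ... typed as strings; if student.csv has a header row, a variant worth trying (column names then come from the file) is:
val mydfH = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/hadoop/SparkT/student.csv")
mydfH.printSchema()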
Step 1: Create a dataframe from a JSON file
val df=spark.read.json("/home/hadoop/SparkT/student.json")
If an error occurs (by default spark.read.json expects one JSON object per line), use the multiline option:
val df1=spark.read.option("multiline","true").json("/home/hadoop/SparkT/student.json")
df1.show()
df1.printSchema()
df1.select("name").show()
df1.select(("name"),("div")).show()
OR
df1.select(df1.col("name"), df1.col("div")).show()
df1.filter(df1.col("rollno") >50).show()
df1.groupBy("div").count().show()
df1.createOrReplaceTempView("people2")
val sqlDF1 = spark.sql("SELECT * FROM people2")
sqlDF1.show
df1.write.csv("output")
Practical 3: Word Count in Apache Spark.
Create a spark-wc.txt
As we all know, a paragraph is a group of sentences that are connected and make absolute
sense. While writing a long essay or letter, we break them into paragraphs for better
understanding and to make a well-structured writing piece.
val data3=sc.textFile("/home/hadoop/SparkT/spark-wc.txt")
data3.collect
val splitdata=data3.flatMap(line=>line.split(" "));
splitdata.collect;
val mapdata=splitdata.map(word=>(word,1));
mapdata.collect
val reducedata=mapdata.reduceByKey(_+_);
reducedata.collect
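Optionally, the counts can be sorted by frequency and saved back to disk; the output path below is only an example:
val sorteddata = reducedata.sortBy(_._2, false);
sorteddata.collect
reducedata.saveAsTextFile("/home/hadoop/SparkT/wc-output")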