19:["$","div",null,{"className":"bg-theme dark:bg-gray-1400 h-dvh w-dvw flex flex-col","children":[["$","a",null,{"href":"#main-content","className":"absolute left-4 top-0 focus-visible:top-4 bg-blue-700 text-white rounded-4 text-xs p-4 z-50 transform -translate-y-full focus-visible:translate-y-0 otransition","children":"Skip to main content"}],"$L36","$L37",["$","div",null,{"role":"main","id":"main-content","className":"relative overflow-y-auto items-center flex flex-col","tabIndex":-1,"children":[["$","div",null,{"className":"max-w-[62.5rem] w-full pt-24 lg:pt-48 px-20 md:px-24","children":["$","$L38",null,{"rootHref":"/learn/course-paths?courseSlug=data-cleaning-and-preprocessing-with-r&unitSlug=data-cleaning-techniques-managing-duplicates-and-outliers-in-r","pathBreadcrumb":{"key":"29","title":"Data Analysis 101 with R","href":"/learn/paths/data-analysis-101-with-r"},"courseBreadcrumb":{"key":"141","title":"Data Cleaning and Preprocessing with R","href":"/learn/courses/data-cleaning-and-preprocessing-with-r","options":[{"key":"139","href":"/learn/courses/data-manipulation-in-r","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/f9fbd669-a743-427d-bfda-d10fd045a7f6_optimized.jpg","numLessons":4,"numPractices":15,"active":false,"label":"Data Manipulation in R","completedAt":"$undefined","completionRatio":"$undefined"},{"key":"140","href":"/learn/courses/statistical-analysis-with-r","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/a6e317d6-ff31-46ff-94b5-5826614dd9bc_optimized.jpg","numLessons":5,"numPractices":21,"active":false,"label":"Statistical Analysis with R","completedAt":"$undefined","completionRatio":"$undefined"},{"key":"141","href":"/learn/courses/data-cleaning-and-preprocessing-with-r","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/50410c93-e151-4d31-b468-cb81c73d439b_optimized.jpg","numLessons":4,"numPractices":17,"active":true,"label":"Data Cleaning and Preprocessing with 
R","completedAt":"$undefined","completionRatio":"$undefined"},{"key":"142","href":"/learn/courses/advanced-data-transformation-techniques","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/46764c36-17d3-4d95-a1b4-ae3a4ec49947_optimized.jpg","numLessons":5,"numPractices":20,"active":false,"label":"Advanced Data Transformation Techniques","completedAt":"$undefined","completionRatio":"$undefined"},{"key":"143","href":"/learn/courses/hypothesis-testing-in-r","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/5ff913fa-3085-4ba0-8a41-64da8805d79e_optimized.jpg","numLessons":4,"numPractices":15,"active":false,"label":"Hypothesis Testing in R","completedAt":"$undefined","completionRatio":"$undefined"},{"key":"144","href":"/learn/courses/data-visualization-with-ggplot2","imageUrl":"https://d3dq4v2xxejk8c.cloudfront.net/uploads/b9a87c6a-44cf-4e0f-8d3b-577d2af25128_optimized.jpg","numLessons":5,"numPractices":20,"active":false,"label":"Data Visualization with ggplot2","completedAt":"$undefined","completionRatio":"$undefined"}]},"lessonBreadcrumb":{"key":"1946","title":"Data Cleaning Techniques: Managing Duplicates and Outliers in R","options":[{"key":"1945","href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/identifying-and-handling-missing-values-in-r-data-cleaning-process","active":false,"label":"Identifying and Handling Missing Values in R Data Cleaning Process","completedAt":"$undefined"},{"key":"1946","href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/data-cleaning-techniques-managing-duplicates-and-outliers-in-r","active":true,"label":"Data Cleaning Techniques: Managing Duplicates and Outliers in R","completedAt":"$undefined"},{"key":"1947","href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/data-normalization-techniques-in-r","active":false,"label":"Data Normalization Techniques in 
R","completedAt":"$undefined"},{"key":"1948","href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/categorical-data-encoding-in-r","active":false,"label":"Categorical Data Encoding in R","completedAt":"$undefined"}]}}]}],["$","div",null,{"className":"max-w-[62.5rem] w-full pt-24 pb-60 px-20 md:px-24 space-y-32 md:space-y-48","children":[["$","div",null,{"className":"mx-auto h-[12rem] w-[21.25rem] md:h-[25.5rem] md:w-[45rem] lg:h-[33.5rem] lg:w-[59.5rem] shrink-0","children":["$","$L39",null,{"src":"https://k3-production-bucket.s3.amazonaws.com/uploads/6aba3588-39ed-4062-8125-98d343080ffb_combined_video.mp4","thumbnailUrl":"https://k3-production-bucket.s3.amazonaws.com/uploads/d3d87fb4-09e7-4c02-8920-0adf89847c5d_slide-1.png","mimeType":"video/mp4","speed":1,"forcePause":false}]}],[["$","div","6891",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Topic Overview and Actualization"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":["In today's lesson, we will focus on ",["$","strong","strong-0",{"children":"identifying and handling duplicates and outliers"}]," to clean our dataset for a more precise analysis."]}]]}]]}],["$","div","6892",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"R Tools for Handling Duplicates"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"Consider a dataset containing students' details from a school. If a student's information is repeated in the dataset, we classify that as a duplicate. 
Duplicates can distort our data, leading to inaccurate results during the analysis."}],"\n",["$","div","pre-0",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Create DataFrame\ndf <- data.frame(\n Name = c('John', 'Anna', 'Peter', 'John', 'Anna'), \n Age = c(16, 15, 13, 16, 15), \n Grade = c(9, 10, 7, 9, 10)\n)"}]}],"\n",["$","p","p-1",{"children":"R provides efficient functionalities to handle duplicates in a dataset. Here's how you can identify duplicates:"}],"\n",["$","div","pre-1",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Identify duplicates\nprint(df[duplicated(df),])\n# 4 John 16 9\n# 5 Anna 15 10"}]}],"\n",["$","p","p-2",{"children":["The ",["$","code","code-0",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"duplicated()"}]," function in R flags duplicate rows. This function can also be used to remove duplicate rows:"]}],"\n",["$","div","pre-2",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Remove duplicates\ndf <- df[!duplicated(df),]\nprint(df)"}]}],"\n",["$","p","p-3",{"children":"After removing the duplicates, your data is clean and ready!"}],"\n",["$","div","pre-3",{"className":"my-24","children":["$","$L3a",null,{"language":"$undefined","onClickPlay":"$undefined","display":"block","children":" Name Age Grade\n1 John 16 9\n2 Anna 15 10\n3 Peter 13 7"}]}]]}]]}],["$","div","6893",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Identifying Outliers"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"An outlier is a data point that is anomalously different from other data points in the same dataset. 
For instance, in our dataset of primary school students' ages, discovering an age like 98 would be considered an outlier."}],"\n",["$","p","p-1",{"children":["Outliers can be detected visually using tools like box plots and scatter plots, or even through statistical methods such as the Z-score or IQR. Today, we will use the ",["$","em","em-0",{"children":"IQR method"}]," to detect outliers:"]}],"\n",["$","p","p-2",{"children":["Here's a brief reminder: a value is considered an outlier if it is more than ",["$","code","code-0",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"1.5 * IQR"}]," below ",["$","code","code-1",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"Q1"}]," (first quartile) or more than ",["$","code","code-2",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"1.5 * IQR"}]," above ",["$","code","code-3",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"Q3"}]," (third quartile)."]}]]}]]}],["$","div","6894",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"R Tools for Handling Outliers"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"Let's use the IQR method in R. 
First, let's define our dataset:"}],"\n",["$","div","pre-0",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Create dataset\ndf <- data.frame(\n students = c('Alice', 'Bob', 'John', 'Ann', 'Rob'),\n scores = c(56, 11, 50, 98, 47)\n)"}]}],"\n",["$","p","p-1",{"children":"Now, let's compute the IQR, Q1, Q3, and detect outliers:"}],"\n",["$","div","pre-1",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Compute Q1, Q3, and IQR\nIQR_scores <- IQR(df$scores) # 9\nQ1_scores <- quantile(df$scores, 0.25) # 47\nQ3_scores <- quantile(df$scores, 0.75) # 56\n\n# Lower and Upper Bounds\nlower_bound <- Q1_scores - 1.5 * IQR_scores # 33.5\nupper_bound <- Q3_scores + 1.5 * IQR_scores # 69.5\n\n# Detect outliers\noutliers <- df[(df$scores < lower_bound) | (df$scores > upper_bound),]\nprint(outliers)"}]}],"\n",["$","p","p-2",{"children":"Here is the output:"}],"\n",["$","div","pre-2",{"className":"my-24","children":["$","$L3a",null,{"language":"$undefined","onClickPlay":"$undefined","display":"block","children":" students scores\n2 Bob 11\n4 Ann 98"}]}]]}]]}],["$","div","6895",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Handling Outliers: Removal"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"There are generally two strategies for dealing with outliers — removing them or replacing them with a median value."}],"\n",["$","p","p-1",{"children":"Removing outliers is the most straightforward method. However, you might opt for other methods as removing outliers can result in data loss. 
To apply it, let's reverse the condition to choose everything except outliers."}],"\n",["$","div","pre-0",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Remove outliers from data\ndf <- df[(df$scores >= lower_bound & df$scores <= upper_bound),]\nprint(df)"}]}],"\n",["$","p","p-2",{"children":"Here is the resulting data, with no outliers included!"}],"\n",["$","div","pre-1",{"className":"my-24","children":["$","$L3a",null,{"language":"$undefined","onClickPlay":"$undefined","display":"block","children":" students scores\n1 Alice 56\n3 John 50\n5 Rob 47"}]}]]}]]}],["$","div","6896",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Handling Outliers: Replacement"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"Alternatively, starting again from the original dataset, outliers can be replaced with the median value. The median value is less susceptible to outliers and hence suitable for replacement."}],"\n",["$","div","pre-0",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Replace outliers with median scores\nmedian_score <- median(df$scores)\ndf$scores[df$scores < lower_bound | df$scores > upper_bound] <- median_score\nprint(df)"}]}],"\n",["$","p","p-1",{"children":["Here, we select outliers using boolean selection and make them equal to the median score. 
The median is ",["$","code","code-0",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"50"}],", hence outlier scores are replaced with ",["$","code","code-1",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"50"}],":"]}],"\n",["$","div","pre-1",{"className":"my-24","children":["$","$L3a",null,{"language":"$undefined","onClickPlay":"$undefined","display":"block","children":" students scores\n1 Alice 56\n2 Bob 50\n3 John 50\n4 Ann 50\n5 Rob 47"}]}]]}]]}],["$","div","6897",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Handling Outliers: Replacement with Mean"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"An alternative to replacing outliers with the median is using the dataset's mean, excluding the outliers. 
This method ensures that the replacement value reflects the central tendency of the main distribution of data without being skewed by the extreme values."}],"\n",["$","p","p-1",{"children":"First, we need to calculate the mean of the data, excluding the outliers:"}],"\n",["$","div","pre-0",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"# Calculating mean without outliers\nmean_scores <- mean(df$scores[(df$scores >= lower_bound & df$scores <= upper_bound)])"}]}],"\n",["$","p","p-2",{"children":"Then, replace the outliers with this mean value:"}],"\n",["$","div","pre-1",{"className":"my-24","children":["$","$L3a",null,{"language":"r","onClickPlay":"$undefined","display":"block","children":"df$scores[df$scores < lower_bound | df$scores > upper_bound] <- mean_scores"}]}],"\n",["$","p","p-3",{"children":"This approach replaces outliers with a mean score that is representative of the bulk of the data, ensuring a more balanced dataset:"}],"\n",["$","div","pre-2",{"className":"my-24","children":["$","$L3a",null,{"language":"$undefined","onClickPlay":"$undefined","display":"block","children":" students scores\n1 Alice 56.0\n2 Bob 51.0\n3 John 50.0\n4 Ann 51.0\n5 Rob 47.0"}]}],"\n",["$","p","p-4",{"children":["Note that the mean value ",["$","code","code-0",{"className":"px-4 py-2 rounded-4 bg-theme-strong dark:bg-theme-medium text-code-lg","style":{"fontVariantLigatures":"none"},"children":"51"}]," (the average of the three non-outlier scores 56, 50, and 47) is calculated without the outliers, offering a more accurate depiction of the central value of most data points."]}]]}]]}],["$","div","6898",{"className":"space-y-32","children":[["$","div",null,{"className":"text-h-md text-theme-strong","children":"Summary"}],["$","div",null,{"className":"space-y-24 text-lg text-theme","children":[["$","p","p-0",{"children":"This lesson discussed what duplicates and outliers are, their implications on data analysis, and how to handle them using R. 
The key to accurate data analysis is clean data. Now is the best time to apply these concepts to real-world data! Let's dive into some practical exercises!"}]]}]]}]],["$","div",null,{"className":"flex gap-16 justify-between md:justify-start","children":[[["$","div",null,{"className":"hidden md:block","children":["$","$L10",null,{"href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/identifying-and-handling-missing-values-in-r-data-cleaning-process","variant":"tertiary","size":"sm","LeadingIcon":"$3b","children":"Previous Lesson"}]}],["$","div",null,{"className":"md:hidden","children":["$","$L10",null,{"href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/identifying-and-handling-missing-values-in-r-data-cleaning-process","variant":"tertiary","size":"sm","LeadingIcon":"$3b","children":"Previous"}]}]],[["$","div",null,{"className":"hidden lg:block","children":["$","$L10",null,{"href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/data-normalization-techniques-in-r","variant":"tertiary","size":"sm","TrailingIcon":"$3c","maxWidth":"47rem","children":"Next Lesson: Data Normalization Techniques in R"}]}],["$","div",null,{"className":"hidden md:block lg:hidden","children":["$","$L10",null,{"href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/data-normalization-techniques-in-r","variant":"tertiary","size":"sm","TrailingIcon":"$3c","maxWidth":"33rem","children":"Next Lesson: Data Normalization Techniques in R"}]}],["$","div",null,{"className":"md:hidden","children":["$","$L10",null,{"href":"/learn/courses/data-cleaning-and-preprocessing-with-r/lessons/data-normalization-techniques-in-r","variant":"tertiary","size":"sm","TrailingIcon":"$3c","children":"Next"}]}]]]}]]}],["$","div",null,{"className":"flex flex-col bg-gray-200 dark:bg-gray-1400 w-full px-20 md:px-24 pt-24 pb-48 items-center","children":["$","div",null,{"className":"max-w-screen-xl w-full flex flex-col lg:flex-row py-20 px-24 items-center 
justify-between gap-48 lg:gap-60 xl:gap-96","children":[["$","$L3d",null,{"alt":"Sign up","src":"/learn/img/signup-module.png","className":"h-auto w-[335px] md:w-[440px] lg:w-[480px] xl:w-[570px]","width":570,"height":336}],["$","div",null,{"className":"flex flex-col gap-32 ml-24 items-center lg:items-start","children":[["$","div",null,{"className":"text-h-md md:text-h-xl text-theme-strong text-center lg:text-start","children":"Join the 1M+ learners on CodeSignal"}],["$","div",null,{"className":"text-lg font-normal md:text-2xl text-theme text-center lg:text-start md:max-w-[536px] lg:max-w-none","children":"Be a part of our community of 1M+ users who develop and demonstrate their skills on CodeSignal"}],["$","$L35",null,{"size":"lg","pageType":"lessonPreview","uiRegion":"joinModule","children":"Start learning today!"}]]}]]}]}],"$L3e"]}]]}]