35:["$","$L2c",null,{"className":"content-sidebar paddingleft blogCat","lg":4,"md":5,"sm":4,"children":["$","div",null,{"className":"sticky-sidebar-wrapper","children":["$","$L36",null,{"contentData":{"canonical":"https://developer.ibm.com/blogs/spark-performance-optimization-guidelines","updated_date":"2024-05-17T00:00:00","publish_date":"2020-06-30T00:00:00","check_date":"2029-01-31","archive_date":null,"full_slug":"spark-performance-optimization-guidelines","slug":"spark-performance-optimization-guidelines","content_type":"blogs","subtype":null,"lang":"en","repo_group":"default","title":"Explore best practices for Spark performance optimization","subtitle":"Tips and goals for developing Spark apps","excerpt":"Learn some performance optimization tips to keep in mind when developing your Spark applications.","contributors":[{"name":"Sunitha Kambhampati","email":"skambha@us.ibm.com","hasAuthorProfile":false}],"editors":null,"tags":null,"primary_category":"spark","categories":["open-source","data-science","machine-learning","artificial-intelligence","spark"],"ibm_components":["spark"],"content":"$37","navigation_items":[{"href":"#spark-characteristics","label":"Spark characteristics"},{"href":"#lazy-loading-behavior","label":"Lazy loading behavior"},{"href":"#file-formats","label":"File formats"},{"href":"#parallelism","label":"Parallelism"},{"href":"#reduce-shuffle","label":"Reduce shuffle"},{"href":"#filter-reduce-dataset-size","label":"Filter/Reduce dataSet size"},{"href":"#cache-appropriately","label":"Cache appropriately"},{"href":"#join","label":"Join"},{"href":"#tune-cluster-resources","label":"Tune cluster resources"},{"href":"#avoid-expensive-operations","label":"Avoid expensive operations"},{"href":"#data-skew","label":"Data skew"},{"href":"#udfs","label":"UDFs"}],"code_sb_navigation":"

Spark characteristics

\n","code_sb_resources":null,"code_sb_related":null,"code_sb_recommended":null,"images":{"card_image":"","header_image":"","highlight_image":"","icon_image":"","social_image":"","square_image":""},"meta":{"meta_description":"Learn some performance optimization tips to keep in mind when developing your Spark applications.","meta_keywords":"apache spark","meta_tags":"null","meta_title":"Explore best practices for Spark performance optimization","og_meta":"null","twitter_meta":"null"},"github_links":null,"demo_links":null,"podcast_sources":null,"episode_source":null,"pagepost_custom_css_value":null,"pagepost_custom_js_value":null,"conference_id":null,"conference_registration":null,"has_registration":null,"conference_start_date":null,"conference_end_date":null,"session_start_date":null,"has_tracks":null,"menu_order":null,"conference_node_type":null,"disable_replay":null,"conference_start_date_offset":null,"conference_ui":null,"event_start_date":null,"event_end_date":null,"event_survey_url":null,"timezone_offset":null,"event_location":null,"event_url":null,"partners":null,"event_address":null,"embedded_slug":null,"video_service":null,"video_id":null,"video_id_type":null,"video_chat":null,"exhibit_type":null,"solutions_for":null,"featured_content_slugs":null,"also_found_in":null,"series_toc":null,"timezone_iana_zone":null,"dataset_format":null,"dataset_license_title":null,"ui_data":{"code_card_displayname":"","code_sb_navigation":"

Spark characteristics

\n","code_sb_recommended":null,"code_series_toc":"","collections":null,"component_links":null,"component_type":null,"course_duration":null,"course_url":null,"dataset_format":"","demo_links":null,"disable_replay":false,"episode_duration":null,"episode_source":null,"event_address":{"event_address_1":"","event_address_2":"","event_address_3":"","event_city":"","event_state_province":"","event_zip_postal_code":"","event_country":""},"event_location":"","event_survey_url":null,"event_url":null,"featured_content_slugs":null,"getting_started_guide":null,"is_portal_with_slack":false,"join_slack_message":null,"newportal":false,"slack_workspace_url":null},"found_in_portals":null,"is_private":"false","ibmcode_private":null,"audio_file_url":null,"listening":null,"time_to_read":"8","like_count":0,"dislike_count":0,"children":null,"left_nav":{"content_types":["blogs","tutorials"],"related_topics":[{"slug":"machine-learning","name":"Machine Learning","type":"technologies"},{"slug":"artificial-intelligence","name":"Artificial intelligence","type":"technologies"},{"slug":"data-science","name":"Data science","type":"technologies"},{"slug":"analytics","name":"Analytics","type":"technologies"}],"definition":{"system_id":15,"system_created_date":"2022-04-02T00:02:50.869966+00:00","system_updated_date":"2026-07-08T13:23:55.696028+00:00","slug":"spark","language":"en","taxonomy":true,"type":"components","name":"Apache Spark","description":"Simplify the challenging and computationally intensive task of processing high volumes of real-time data.","featured_content_heading":null,"all_content_heading":null,"template":"component","excerpt":"An open-source analytics engine for large-scale data analytics","more_resources":null,"featured_content":null,"solutions_sections":null,"featured":false,"hide_hub":false,"adv_tool":true,"contribute_form":false,"cc_labels":["Open Source","Analytics","Data-&-AI"],"subtype":"opensource","product_link":[{"label":"Get Apache Spark","url":"https://spark.apache.org/"}],"getting_started_guide":null,"previous_slug":null,"experts":null,"solutions_for_header":null,"links":null,"trial_link":null,"deactivate":null,"strategic_content_areas":["data"],"title":null,"section":null,"url":null,"items":null,"section_order":null,"action_button_text":null,"action_button_link":null,"image_url":null,"section_item_order":null,"style":null,"ignore_prod":null,"summary":null,"action_buttons":null,"delete":null},"strategic_content_areas":["data","generative-ai","open-source"]}},"categories":"$35:props:children:props:children:props:contentData:categories","taxonomies":{"solutions":{"name":"Solutions","type":"tagType"},"maximo":{"name":"IBM Maximo Application Suite","type":"components"},"redhat-ansible":{"name":"Red Hat Ansible Automation Platform","type":"components"},"hashicorp-vault":{"name":"HashiCorp Vault","type":"components"},"topics":{"name":"Topics","type":"tagType"},"rag":{"name":"RAG","type":"technologies"},"docling":{"name":"Docling","type":"components"},"ibm-bob":{"name":"IBM Bob","type":"components"},"spark":{"name":"Apache Spark","type":"components"},"ci-cd":{"name":"CI/CD","type":"devpractices"},"watsonx-data":{"name":"watsonx.data","type":"components"},"devops":{"name":"DevOps","type":"devpractices"},"quantum-safe":{"name":"Quantum safe","type":"devpractices"},"paas":{"name":"Platform as a service","type":"technologies"},"reactive-systems":{"name":"Reactive systems","type":"depmodels"},"quantum-computing":{"name":"Quantum computing","type":"depmodels"},"artificial-intelligence":{"name":"Artificial intelligence","type":"technologies"},"ibm-cloud-paks":{"name":"IBM Cloud Paks","type":"components"},"qradar":{"name":"IBM Security QRadar Suite","type":"components"},"edge-computing":{"name":"Edge computing","type":"depmodels"},"section-item-courses":{"name":null,"type":"home-page-configuration"},"home-page-hero-banner":{"name":null,"type":"home-page-configuration"},"cloud-pak-for-integration":{"name":"IBM Cloud Pak for Integration","type":"components"},"natural-language-processing":{"name":"Natural language processing","type":"technologies"},"front-end-development":{"name":"Front-end development","type":"technologies"},"blockchain":{"name":"Blockchain","type":"technologies"},"langchain":{"name":"Langchain","type":"components"},"data-privacy":{"name":"Data privacy","type":"technologies"},"section-item-tutorials":{"name":null,"type":"home-page-configuration"},"open-j9":{"name":"Eclipse OpenJ9","type":"components"},"depmodels":{"name":"Architectures and deployment models","type":"tagType"},"beeai":{"name":"BeeAI","type":"components"},"instana":{"name":"IBM Instana","type":"components"},"ibmz":{"name":"IBM Z and z/OS","type":"components"},"watsonx-ai":{"name":"watsonx.ai","type":"components"},"deep-learning":{"name":"Deep learning","type":"technologies"},"ibm-db2-warehouse":{"name":"IBM Db2 Warehouse","type":"components"},"envizi":{"name":"IBM Envizi","type":"components"},"redhat-enterprise-linux-ai":{"name":"RHEL AI","type":"components"},"docker":{"name":"Docker","type":"components"},"cloud-pak-for-applications":{"name":"IBM Cloud Pak for Applications","type":"components"},"containers":{"name":"Containers","type":"technologies"},"open-liberty":{"name":"Open Liberty","type":"components"},"conversation":{"name":"Conversational AI","type":"technologies"},"jakarta":{"name":"Jakarta EE","type":"components"},"granite-models":{"name":"Granite models","type":"components"},"watsonx":{"name":"watsonx","type":"components"},"section-item-articles":{"name":null,"type":"home-page-configuration"},"node-js":{"name":"Node.js","type":"languages"},"data-prep-kit":{"name":"Data Prep Kit (DPK)","type":"components"},"cloud-pak-for-data":{"name":"IBM Cloud Pak for Data","type":"components"},"ibm-linuxone":{"name":"IBM LinuxONE","type":"components"},"hybrid-cloud":{"name":"Hybrid Cloud","type":"depmodels"},"vision":{"name":"Computer vision","type":"technologies"},"hyper-protect":{"name":"IBM Cloud Hyper Protect Services","type":"components"},"section-item-messaging":{"name":null,"type":"home-page-configuration"},"section-item-security":{"name":null,"type":"home-page-configuration"},"section-item-learning-paths":{"name":null,"type":"home-page-configuration"},"section-item-genai-demo-hub":{"name":null,"type":"home-page-configuration"},"section-item-watsonx-ai-trial":{"name":null,"type":"home-page-configuration"},"section-item-ibm-cloud-catalog":{"name":null,"type":"home-page-configuration"},"section-item-ibm-granite-code":{"name":null,"type":"home-page-configuration"},"section-item-redhat-developer":{"name":null,"type":"home-page-configuration"},"maximo-visual-inspection":{"name":"IBM Maximo Visual Inspection","type":"components"},"section-item-hashicorp-developer":{"name":null,"type":"home-page-configuration"},"section-item-call-for-code":{"name":null,"type":"home-page-configuration"},"section-item-ibm-tech-exchange-community":{"name":null,"type":"home-page-configuration"},"home-page-hero":{"name":null,"type":"home-page-configuration"},"section-item-open-source":{"name":null,"type":"home-page-configuration"},"section-item-guided-projects":{"name":null,"type":"home-page-configuration"},"section-item-ibm-developer-newsletter":{"name":null,"type":"home-page-configuration"},"android":{"name":"Android","type":"components"},"section-explore":{"name":null,"type":"home-page-configuration"},"section-code":{"name":null,"type":"home-page-configuration"},"section-item-hackathons":{"name":null,"type":"home-page-configuration"},"section-item-watsonx-dev-hub":{"name":null,"type":"home-page-configuration"},"cloud-native-apps":{"name":"Cloud-native app development","type":"devpractices"},"data":{"name":"Data","type":"technologies"},"elyra":{"name":"Elyra","type":"components"},"aix":{"name":"IBM AIX","type":"components"},"ibm-mq":{"name":"IBM MQ","type":"components"},"api":{"name":"API management","type":"devpractices"},"application-modernization":{"name":"Application modernization","type":"devpractices"},"databases":{"name":"Databases","type":"technologies"},"data-management":{"name":"Data management","type":"technologies"},"data-stores":{"name":"Data storage","type":"technologies"},"aws":{"name":"Amazon Web Services (AWS)","type":"components"},"data-science":{"name":"Data science","type":"technologies"},"cloud-ibm":{"name":"IBM Cloud","type":"components"},"cloud-code-engine":{"name":"IBM Cloud Code Engine","type":"components"},"ibm-power":{"name":"IBM Power","type":"components"},"semeru-runtimes":{"name":"IBM Semeru Runtimes","type":"languages"},"security-verify":{"name":"IBM Security Verify","type":"components"},"cloud-pak-for-watson-aiops":{"name":"IBM Cloud Pak for AIOps","type":"components"},"cloud-pak-for-security":{"name":"IBM Cloud Pak for Security","type":"components"},"home-page-hero-carousel-item-2":{"name":null,"type":"home-page-configuration"},"home-page-hero-carousel-item-3":{"name":null,"type":"home-page-configuration"},"section-learn":{"name":null,"type":"home-page-configuration"},"section-engage":{"name":null,"type":"home-page-configuration"},"section-item-ai":{"name":null,"type":"home-page-configuration"},"section-item-data":{"name":null,"type":"home-page-configuration"},"section-item-cloud-native":{"name":null,"type":"home-page-configuration"},"section-item-observability":{"name":null,"type":"home-page-configuration"},"section-item-instructlab":{"name":null,"type":"home-page-configuration"},"section-item-trials":{"name":null,"type":"home-page-configuration"},"section-item-api-hub":{"name":null,"type":"home-page-configuration"},"section-item-python":{"name":null,"type":"home-page-configuration"},"section-item-open-liberty":{"name":null,"type":"home-page-configuration"},"section-item-nodejs":{"name":null,"type":"home-page-configuration"},"section-item-pytorch":{"name":null,"type":"home-page-configuration"},"section-item-cobol":{"name":null,"type":"home-page-configuration"},"devpractices":{"name":"Development practices","type":"tagType"},"languages":{"name":"Languages, frameworks, and runtimes","type":"tagType"},"home-page-events":{"name":null,"type":"home-page-configuration"},"components":{"name":"Products and services","type":"tagType"},"technologies":{"name":"Technologies","type":"tagType"},"section-item-java":{"name":null,"type":"home-page-configuration"},"home-page-hero-carousel-item-1":{"name":null,"type":"home-page-configuration"},"section-build":{"name":null,"type":"home-page-configuration"},"sterling":{"name":"IBM Sterling","type":"components"},"storage":{"name":"Storage","type":"technologies"},"cloud":{"name":"Cloud","type":"depmodels"},"guardium":{"name":"IBM Guardium","type":"components"},"ibm-i":{"name":"IBM i","type":"components"},"aiops":{"name":"AIOps","type":"devpractices"},"analytics":{"name":"Analytics","type":"technologies"},"spss-modeler":{"name":"IBM SPSS Modeler","type":"components"},"kafka":{"name":"Apache Kafka","type":"components"},"jsphere":{"name":"IBM JSphere Suite for Java","type":"components"},"turbonomic":{"name":"IBM Turbonomic","type":"components"},"istio":{"name":"Istio","type":"components"},"infrastructure":{"name":"IT Infrastructure","type":"technologies"},"java":{"name":"Java","type":"languages"},"java-platform":{"name":"Java Platform","type":"components"},"automation":{"name":"Automation","type":"technologies"},"iot":{"name":"IoT","type":"technologies"},"jupyter":{"name":"Jupyter","type":"components"},"knative":{"name":"Knative","type":"components"},"kserve":{"name":"KServe","type":"components"},"kubeflow":{"name":"Kubeflow","type":"components"},"kubernetes":{"name":"Kubernetes","type":"components"},"javascript":{"name":"JavaScript","type":"languages"},"linux":{"name":"Linux","type":"technologies"},"linux-on-ibm-power":{"name":"Linux on IBM Power","type":"components"},"machine-learning":{"name":"Machine Learning","type":"technologies"},"netezza-performance-server":{"name":"Netezza Performance Server","type":"components"},"cobol":{"name":"COBOL","type":"languages"},"large-language-models":{"name":"Large language models (LLMs)","type":"technologies"},"mainframe":{"name":"Mainframes","type":"technologies"},"messaging":{"name":"Messaging","type":"technologies"},"microprofile":{"name":"MicroProfile","type":"components"},"microservices":{"name":"Microservices","type":"depmodels"},"observability":{"name":"Observability","type":"devpractices"},"serverless":{"name":"Serverless","type":"depmodels"},"generative-ai":{"name":"Generative AI","type":"technologies"},"mobile":{"name":"Mobile development","type":"technologies"},"mqtt":{"name":"MQTT","type":"components"},"multicloud-development":{"name":"Multicloud development","type":"devpractices"},"node-red":{"name":"Node-RED","type":"components"},"geolocation":{"name":"Geolocation","type":"technologies"},"opensource-ai":{"name":"Open Source AI","type":"technologies"},"open-source-development":{"name":"Open-source development","type":"devpractices"},"python":{"name":"Python","type":"languages"},"pytorch":{"name":"PyTorch","type":"components"},"quarkus":{"name":"Quarkus","type":"components"},"redhat-enterprise-linux":{"name":"RHEL","type":"components"},"redhat-openshift":{"name":"Red Hat OpenShift","type":"components"},"redhat-openshift-ibm-cloud":{"name":"Red Hat OpenShift on IBM Cloud","type":"components"},"redhat-openshift-ai":{"name":"Red Hat OpenShift AI","type":"components"},"security":{"name":"Security","type":"devpractices"},"software-development":{"name":"Software development","type":"devpractices"},"agentic-ai":{"name":"Agentic AI","type":"technologies"},"tekton":{"name":"Tekton","type":"components"},"cloud-pak-for-automation":{"name":"IBM Cloud Pak for Business Automation","type":"components"},"speech-and-empathy":{"name":"Speech and Empathy","type":"technologies"},"tensorflow":{"name":"TensorFlow","type":"components"},"spring":{"name":"Spring","type":"components"},"terraform":{"name":"Terraform","type":"components"},"watsonx-governance":{"name":"watsonx.governance","type":"components"},"watson-apis":{"name":"Watson APIs","type":"components"},"watson-assistant":{"name":"watsonx Assistant","type":"components"},"watson-discovery":{"name":"Watson Discovery","type":"components"},"web-development":{"name":"Web development","type":"technologies"},"zero-trust":{"name":"Zero trust","type":"technologies"},"ibm-db2-database":{"name":"IBM Db2","type":"components"},"watsonx-orchestrate":{"name":"watsonx Orchestrate","type":"components"},"watson-studio":{"name":"Watson Studio","type":"components"},"ibm-quantum-safe":{"name":"IBM Quantum Safe","type":"components"}}}]}]}]