avformat/mov: don't abort on duplicate Mastering Display Metadata boxes

The VP9 spec defines a SmDm box for this information, and the ISOBMFF spec defines an mdcv one. If both are present, just ignore one of them. This is in line with clli and CoLL boxes. Fixes ticket #10711. Signed-off-by: James Almer <jamrial@gmail.com>
Revert "avcodec/hevc_ps: allocate only the required HEVCHdrParams within a VPS"
2024-04-05 22:06:19 +02:00 · 2024-04-05 22:06:19 +02:00 · 2024-04-05 22:06:19 +02:00 · 2024-04-05 22:06:19 +02:00 · 2024-04-05 22:06:19 +02:00 · 2024-04-05 22:06:19 +02:00
95 changed files with 5750 additions and 536 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,3 @@
+IndentWidth: 4
+UseTab: Never
+DisableFormat: true
--- a/compat/windows/dxva_av1.h
+++ b/compat/windows/dxva_av1.h
@ -0,0 +1,289 @@
+//------------------------------------------------------------------------------
+// File: DXVA.h
+//
+// Desc: DirectX Video Acceleration header file.
+//
+// Copyright (c) 1999 - 2002, Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+#ifndef _DIRECTX_AV1_VA_
+#define _DIRECTX_AV1_VA_
+
+#pragma pack(push, 1)
+
+/* AV1 picture entry: picture dimensions, six global-motion matrix values, motion flags and an index (byte-packed via the enclosing #pragma pack(push, 1)) */
+typedef struct _DXVA_PicEntry_AV1 {
+
+    UINT width;
+    UINT height;
+
+    // Global motion parameters
+    INT wmmat[6];
+    union {
+        struct {
+            UCHAR wminvalid : 1;
+            UCHAR wmtype : 2;
+            UCHAR Reserved : 5; // 1 + 2 + 5 = 8 bits, filling the whole byte
+        };
+        UCHAR GlobalMotionFlags; // the same byte as the bitfield above, accessed as a whole
+    } DUMMYUNIONNAME;
+
+    UCHAR Index; // NOTE(review): presumably the surface/texture index of this picture -- confirm against the DXVA AV1 spec
+    UINT16 Reserved16Bits;
+
+} DXVA_PicEntry_AV1, *LPDXVA_PicEntry_AV1;
+
+/* AV1 picture parameters structure: per-frame decode parameters grouped into tiles, coding tools, format flags, references, loop filter, quantization, CDEF, segmentation and film grain */
+typedef struct _DXVA_PicParams_AV1 {
+    UINT width;
+    UINT height;
+
+    UINT max_width;
+    UINT max_height;
+
+    UCHAR CurrPicTextureIndex;
+    UCHAR superres_denom;
+    UCHAR bitdepth;
+    UCHAR seq_profile;
+
+    // Tiles:
+    struct {
+        UCHAR cols;
+        UCHAR rows;
+        USHORT context_update_id;
+        USHORT widths[64];  // room for 64 entries; NOTE(review): units (superblocks vs. pixels) not visible here -- confirm
+        USHORT heights[64]; // room for 64 entries; same caveat as widths
+    } tiles;
+
+    // Coding Tools
+    union {
+        struct {
+            UINT use_128x128_superblock : 1;
+            UINT intra_edge_filter : 1;
+            UINT interintra_compound : 1;
+            UINT masked_compound : 1;
+            UINT warped_motion : 1;
+            UINT dual_filter : 1;
+            UINT jnt_comp : 1;
+            UINT screen_content_tools : 1;
+            UINT integer_mv : 1;
+            UINT cdef : 1;
+            UINT restoration : 1;
+            UINT film_grain : 1;
+            UINT intrabc : 1;
+            UINT high_precision_mv : 1;
+            UINT switchable_motion_mode : 1;
+            UINT filter_intra : 1;
+            UINT disable_frame_end_update_cdf : 1;
+            UINT disable_cdf_update : 1;
+            UINT reference_mode : 1;
+            UINT skip_mode : 1;
+            UINT reduced_tx_set : 1;
+            UINT superres : 1;
+            UINT tx_mode : 2;
+            UINT use_ref_frame_mvs : 1;
+            UINT enable_ref_frame_mvs : 1;
+            UINT reference_frame_update : 1;
+            UINT Reserved : 5; // brings the bitfield total to 32 bits
+        };
+        UINT32 CodingParamToolFlags; // all coding-tool bits above viewed as one 32-bit word
+    } coding;
+
+    // Format & Picture Info flags
+    union {
+        struct {
+            UCHAR frame_type : 2;
+            UCHAR show_frame : 1;
+            UCHAR showable_frame : 1;
+            UCHAR subsampling_x : 1;
+            UCHAR subsampling_y : 1;
+            UCHAR mono_chrome : 1;
+            UCHAR Reserved : 1; // brings the bitfield total to 8 bits
+        };
+        UCHAR FormatAndPictureInfoFlags; // the same byte as the bitfield above
+    } format;
+
+    // References
+    UCHAR primary_ref_frame;
+    UCHAR order_hint;
+    UCHAR order_hint_bits;
+
+    DXVA_PicEntry_AV1 frame_refs[7];
+    UCHAR RefFrameMapTextureIndex[8];
+
+    // Loop filter parameters
+    struct {
+        UCHAR filter_level[2];
+        UCHAR filter_level_u;
+        UCHAR filter_level_v;
+
+        UCHAR sharpness_level;
+        union {
+            struct {
+                UCHAR mode_ref_delta_enabled : 1;
+                UCHAR mode_ref_delta_update : 1;
+                UCHAR delta_lf_multi : 1;
+                UCHAR delta_lf_present : 1;
+                UCHAR Reserved : 4; // brings the bitfield total to 8 bits
+            };
+            UCHAR ControlFlags;
+        } DUMMYUNIONNAME;
+        CHAR ref_deltas[8];
+        CHAR mode_deltas[2];
+        UCHAR delta_lf_res;
+        UCHAR frame_restoration_type[3];      // three entries; NOTE(review): presumably one per plane -- confirm
+        USHORT log2_restoration_unit_size[3]; // three entries; same caveat
+        UINT16 Reserved16Bits;
+    } loop_filter;
+
+    // Quantization
+    struct {
+        union {
+            struct {
+                UCHAR delta_q_present : 1;
+                UCHAR delta_q_res : 2;
+                UCHAR Reserved : 5; // brings the bitfield total to 8 bits
+            };
+            UCHAR ControlFlags;
+        } DUMMYUNIONNAME;
+
+        UCHAR base_qindex;
+        CHAR y_dc_delta_q;
+        CHAR u_dc_delta_q;
+        CHAR v_dc_delta_q;
+        CHAR u_ac_delta_q;
+        CHAR v_ac_delta_q;
+        // using_qmatrix:
+        UCHAR qm_y;
+        UCHAR qm_u;
+        UCHAR qm_v;
+        UINT16 Reserved16Bits;
+    } quantization;
+
+    // Cdef parameters
+    struct {
+        union {
+            struct {
+                UCHAR damping : 2;
+                UCHAR bits : 2;
+                UCHAR Reserved : 4; // brings the bitfield total to 8 bits
+            };
+            UCHAR ControlFlags;
+        } DUMMYUNIONNAME;
+
+        union {
+            struct {
+                UCHAR primary : 6;
+                UCHAR secondary : 2;
+            };
+            UCHAR combined; // primary and secondary packed into one byte
+        } y_strengths[8];
+
+        union {
+            struct {
+                UCHAR primary : 6;
+                UCHAR secondary : 2;
+            };
+            UCHAR combined; // primary and secondary packed into one byte
+        } uv_strengths[8];
+
+    } cdef;
+
+    UCHAR interp_filter;
+
+    // Segmentation
+    struct {
+        union {
+            struct {
+                UCHAR enabled : 1;
+                UCHAR update_map : 1;
+                UCHAR update_data : 1;
+                UCHAR temporal_update : 1;
+                UCHAR Reserved : 4; // brings the bitfield total to 8 bits
+            };
+            UCHAR ControlFlags;
+        } DUMMYUNIONNAME;
+        UCHAR Reserved24Bits[3];
+
+        union {
+            struct {
+                UCHAR alt_q : 1;
+                UCHAR alt_lf_y_v : 1;
+                UCHAR alt_lf_y_h : 1;
+                UCHAR alt_lf_u : 1;
+                UCHAR alt_lf_v : 1;
+                UCHAR ref_frame : 1;
+                UCHAR skip : 1;
+                UCHAR globalmv : 1;
+            };
+            UCHAR mask; // the eight feature bits above as one byte
+        } feature_mask[8];
+
+        SHORT feature_data[8][8]; // NOTE(review): presumably [segment][feature], matching feature_mask[8] and its 8 bits -- confirm
+
+    } segmentation;
+
+    struct {
+        union {
+            struct {
+                USHORT apply_grain : 1;
+                USHORT scaling_shift_minus8 : 2;
+                USHORT chroma_scaling_from_luma : 1;
+                USHORT ar_coeff_lag : 2;
+                USHORT ar_coeff_shift_minus6 : 2;
+                USHORT grain_scale_shift : 2;
+                USHORT overlap_flag : 1;
+                USHORT clip_to_restricted_range : 1;
+                USHORT matrix_coeff_is_identity : 1;
+                USHORT Reserved : 3; // brings the bitfield total to 16 bits
+            };
+            USHORT ControlFlags;
+        } DUMMYUNIONNAME;
+
+        USHORT grain_seed;
+        UCHAR scaling_points_y[14][2];
+        UCHAR num_y_points;
+        UCHAR scaling_points_cb[10][2];
+        UCHAR num_cb_points;
+        UCHAR scaling_points_cr[10][2];
+        UCHAR num_cr_points;
+        UCHAR ar_coeffs_y[24];
+        UCHAR ar_coeffs_cb[25];
+        UCHAR ar_coeffs_cr[25];
+        UCHAR cb_mult;
+        UCHAR cb_luma_mult;
+        UCHAR cr_mult;
+        UCHAR cr_luma_mult;
+        UCHAR Reserved8Bits;
+        SHORT cb_offset;
+        SHORT cr_offset;
+    } film_grain;
+
+    UINT   Reserved32Bits;
+    UINT   StatusReportFeedbackNumber;
+} DXVA_PicParams_AV1, *LPDXVA_PicParams_AV1;
+
+/* AV1 tile structure: data offset and size of one tile plus its row/column position */
+typedef struct _DXVA_Tile_AV1 {
+    UINT   DataOffset; // NOTE(review): presumably a byte offset into the bitstream buffer -- confirm
+    UINT   DataSize;   // NOTE(review): presumably the tile size in bytes -- confirm
+    USHORT row;
+    USHORT column;
+    UINT16 Reserved16Bits;
+    UCHAR anchor_frame;
+    UCHAR Reserved8Bits;
+} DXVA_Tile_AV1, *LPDXVA_Tile_AV1;
+
+/* AV1 status reporting data structure: a feedback number, the picture entry it refers to and status fields */
+typedef struct _DXVA_Status_AV1 {
+    UINT  StatusReportFeedbackNumber; // same field name as in DXVA_PicParams_AV1; NOTE(review): presumably used to match a report to a submitted picture -- confirm
+    DXVA_PicEntry_AV1 CurrPic;
+    UCHAR  BufType;
+    UCHAR  Status;
+    UCHAR  Reserved8Bits;
+    USHORT NumMbsAffected;
+} DXVA_Status_AV1, *LPDXVA_Status_AV1;
+
+#pragma pack(pop)
+
+#endif // _DIRECTX_AV1_VA_
--- a/compat/windows/dxva_hevc.h
+++ b/compat/windows/dxva_hevc.h
@ -0,0 +1,150 @@
+//------------------------------------------------------------------------------
+// Copyright (c) 1999 - 2002, Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+#ifndef __DIRECTX_VA_HEVC__
+#define __DIRECTX_VA_HEVC__
+
+#pragma pack(push, 1)
+
+/* HEVC Picture Entry structure: a single byte holding a 7-bit index and a 1-bit flag */
+typedef struct _DXVA_PicEntry_HEVC
+{
+    union
+    {
+        struct
+        {
+            UCHAR Index7Bits : 7;
+            UCHAR AssociatedFlag : 1;
+        };
+        UCHAR bPicEntry; // the same byte as the two bitfields above, accessed as a whole
+    };
+} DXVA_PicEntry_HEVC, *LPDXVA_PicEntry_HEVC;
+
+/* HEVC Picture Parameter structure: sequence/picture-level fields, three packed flag words, tile layout and reference picture lists (byte-packed via the enclosing #pragma pack(push, 1)) */
+typedef struct _DXVA_PicParams_HEVC {
+    USHORT      PicWidthInMinCbsY;
+    USHORT      PicHeightInMinCbsY;
+    union {
+        struct {
+            USHORT  chroma_format_idc                       : 2;
+            USHORT  separate_colour_plane_flag              : 1;
+            USHORT  bit_depth_luma_minus8                   : 3;
+            USHORT  bit_depth_chroma_minus8                 : 3;
+            USHORT  log2_max_pic_order_cnt_lsb_minus4       : 4;
+            USHORT  NoPicReorderingFlag                     : 1;
+            USHORT  NoBiPredFlag                            : 1;
+            USHORT  ReservedBits1                            : 1; // brings the bitfield total to 16 bits
+        };
+        USHORT wFormatAndSequenceInfoFlags; // the 16 bits above as one word
+    };
+    DXVA_PicEntry_HEVC  CurrPic;
+    UCHAR   sps_max_dec_pic_buffering_minus1;
+    UCHAR   log2_min_luma_coding_block_size_minus3;
+    UCHAR   log2_diff_max_min_luma_coding_block_size;
+    UCHAR   log2_min_transform_block_size_minus2;
+    UCHAR   log2_diff_max_min_transform_block_size;
+    UCHAR   max_transform_hierarchy_depth_inter;
+    UCHAR   max_transform_hierarchy_depth_intra;
+    UCHAR   num_short_term_ref_pic_sets;
+    UCHAR   num_long_term_ref_pics_sps;
+    UCHAR   num_ref_idx_l0_default_active_minus1;
+    UCHAR   num_ref_idx_l1_default_active_minus1;
+    CHAR    init_qp_minus26;
+    UCHAR   ucNumDeltaPocsOfRefRpsIdx;
+    USHORT  wNumBitsForShortTermRPSInSlice;
+    USHORT  ReservedBits2;
+
+    union {
+        struct {
+            UINT32  scaling_list_enabled_flag                    : 1;
+            UINT32  amp_enabled_flag                            : 1;
+            UINT32  sample_adaptive_offset_enabled_flag         : 1;
+            UINT32  pcm_enabled_flag                            : 1;
+            UINT32  pcm_sample_bit_depth_luma_minus1            : 4;
+            UINT32  pcm_sample_bit_depth_chroma_minus1          : 4;
+            UINT32  log2_min_pcm_luma_coding_block_size_minus3  : 2;
+            UINT32  log2_diff_max_min_pcm_luma_coding_block_size : 2;
+            UINT32  pcm_loop_filter_disabled_flag                : 1;
+            UINT32  long_term_ref_pics_present_flag             : 1;
+            UINT32  sps_temporal_mvp_enabled_flag               : 1;
+            UINT32  strong_intra_smoothing_enabled_flag         : 1;
+            UINT32  dependent_slice_segments_enabled_flag       : 1;
+            UINT32  output_flag_present_flag                    : 1;
+            UINT32  num_extra_slice_header_bits                 : 3;
+            UINT32  sign_data_hiding_enabled_flag               : 1;
+            UINT32  cabac_init_present_flag                     : 1;
+            UINT32  ReservedBits3                               : 5; // brings the bitfield total to 32 bits
+        };
+        UINT32 dwCodingParamToolFlags; // the 32 bits above as one word
+    };
+
+    union {
+        struct {
+            UINT32  constrained_intra_pred_flag                 : 1;
+            UINT32  transform_skip_enabled_flag                 : 1;
+            UINT32  cu_qp_delta_enabled_flag                    : 1;
+            UINT32  pps_slice_chroma_qp_offsets_present_flag    : 1;
+            UINT32  weighted_pred_flag                          : 1;
+            UINT32  weighted_bipred_flag                        : 1;
+            UINT32  transquant_bypass_enabled_flag              : 1;
+            UINT32  tiles_enabled_flag                          : 1;
+            UINT32  entropy_coding_sync_enabled_flag            : 1;
+            UINT32  uniform_spacing_flag                        : 1;
+            UINT32  loop_filter_across_tiles_enabled_flag       : 1;
+            UINT32  pps_loop_filter_across_slices_enabled_flag  : 1;
+            UINT32  deblocking_filter_override_enabled_flag     : 1;
+            UINT32  pps_deblocking_filter_disabled_flag         : 1;
+            UINT32  lists_modification_present_flag             : 1;
+            UINT32  slice_segment_header_extension_present_flag : 1;
+            UINT32  IrapPicFlag                                 : 1;
+            UINT32  IdrPicFlag                                  : 1;
+            UINT32  IntraPicFlag                                : 1;
+            UINT32  ReservedBits4                               : 13; // brings the bitfield total to 32 bits
+        };
+        UINT32 dwCodingSettingPicturePropertyFlags; // the 32 bits above as one word
+    };
+    CHAR    pps_cb_qp_offset;
+    CHAR    pps_cr_qp_offset;
+    UCHAR   num_tile_columns_minus1;
+    UCHAR   num_tile_rows_minus1;
+    USHORT  column_width_minus1[19]; // fixed capacity of 19 entries
+    USHORT  row_height_minus1[21];   // fixed capacity of 21 entries
+    UCHAR   diff_cu_qp_delta_depth;
+    CHAR    pps_beta_offset_div2;
+    CHAR    pps_tc_offset_div2;
+    UCHAR   log2_parallel_merge_level_minus2;
+    INT     CurrPicOrderCntVal;
+    DXVA_PicEntry_HEVC	RefPicList[15];
+    UCHAR   ReservedBits5;
+    INT     PicOrderCntValList[15]; // same length as RefPicList; NOTE(review): presumably index-aligned with it -- confirm
+    UCHAR   RefPicSetStCurrBefore[8];
+    UCHAR   RefPicSetStCurrAfter[8];
+    UCHAR   RefPicSetLtCurr[8];
+    USHORT  ReservedBits6;
+    USHORT  ReservedBits7;
+    UINT    StatusReportFeedbackNumber;
+} DXVA_PicParams_HEVC, *LPDXVA_PicParams_HEVC;
+
+/* HEVC Quantization Matrix structure: four groups of scaling lists plus DC coefficients for the last two groups */
+typedef struct _DXVA_Qmatrix_HEVC
+{
+    UCHAR ucScalingLists0[6][16]; // 6 lists of 16 coefficients
+    UCHAR ucScalingLists1[6][64]; // 6 lists of 64 coefficients
+    UCHAR ucScalingLists2[6][64]; // 6 lists of 64 coefficients
+    UCHAR ucScalingLists3[2][64]; // 2 lists of 64 coefficients
+    UCHAR ucScalingListDCCoefSizeID2[6]; // one entry per list in ucScalingLists2
+    UCHAR ucScalingListDCCoefSizeID3[2]; // one entry per list in ucScalingLists3
+} DXVA_Qmatrix_HEVC, *LPDXVA_Qmatrix_HEVC;
+
+
+/* HEVC Slice Control Structure (short form): location and size of one slice's data plus a chopping indicator */
+typedef struct _DXVA_Slice_HEVC_Short
+{
+    UINT    BSNALunitDataLocation; // NOTE(review): presumably a byte offset of the NAL unit within the bitstream buffer -- confirm
+    UINT    SliceBytesInBuffer;
+    USHORT  wBadSliceChopping;
+} DXVA_Slice_HEVC_Short, *LPDXVA_Slice_HEVC_Short;
+
+#pragma pack(pop)
+#endif
--- a/compat/windows/dxva_vpx.h
+++ b/compat/windows/dxva_vpx.h
@ -0,0 +1,185 @@
+//------------------------------------------------------------------------------
+// Copyright (c) 1999 - 2002, Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+#ifndef __DIRECTX_VA_VPX__
+#define __DIRECTX_VA_VPX__
+
+#pragma pack(push, 1)
+
+/* VPx picture entry data structure: a single byte holding a 7-bit index and a 1-bit flag */
+typedef struct _DXVA_PicEntry_VPx {
+    union {
+        struct {
+            UCHAR Index7Bits : 7;
+            UCHAR AssociatedFlag : 1;
+        };
+        UCHAR bPicEntry; // the same byte as the two bitfields above, accessed as a whole
+    };
+} DXVA_PicEntry_VPx, *LPDXVA_PicEntry_VPx;
+
+/* VP9 segmentation structure: control flags, tree/prediction probabilities and per-entry feature data and masks */
+typedef struct _segmentation_VP9 {
+    union {
+        struct {
+            UCHAR enabled : 1;
+            UCHAR update_map : 1;
+            UCHAR temporal_update : 1;
+            UCHAR abs_delta : 1;
+            UCHAR ReservedSegmentFlags4Bits : 4; // brings the bitfield total to 8 bits
+        };
+        UCHAR wSegmentInfoFlags; // the same byte as the bitfield above
+    };
+    UCHAR tree_probs[7];
+    UCHAR pred_probs[3];
+    SHORT feature_data[8][4]; // NOTE(review): presumably [segment][feature] -- confirm
+    UCHAR feature_mask[8];    // same first dimension as feature_data
+} DXVA_segmentation_VP9;
+
+/* VP9 picture parameters structure: frame format flags, dimensions, reference frames, loop filter, quantizer deltas, segmentation and tile/header sizes */
+typedef struct _DXVA_PicParams_VP9 {
+    DXVA_PicEntry_VPx    CurrPic;
+    UCHAR                profile;
+    union {
+        struct {
+            USHORT frame_type : 1;
+            USHORT show_frame : 1;
+            USHORT error_resilient_mode : 1;
+            USHORT subsampling_x : 1;
+            USHORT subsampling_y : 1;
+            USHORT extra_plane : 1;
+            USHORT refresh_frame_context : 1;
+            USHORT frame_parallel_decoding_mode : 1;
+            USHORT intra_only : 1;
+            USHORT frame_context_idx : 2;
+            USHORT reset_frame_context : 2;
+            USHORT allow_high_precision_mv : 1;
+            USHORT ReservedFormatInfo2Bits : 2; // brings the bitfield total to 16 bits
+        };
+        USHORT wFormatAndPictureInfoFlags; // the 16 bits above as one word
+    };
+    UINT  width;
+    UINT  height;
+    UCHAR BitDepthMinus8Luma;
+    UCHAR BitDepthMinus8Chroma;
+    UCHAR interp_filter;
+    UCHAR Reserved8Bits;
+    DXVA_PicEntry_VPx  ref_frame_map[8];
+    UINT  ref_frame_coded_width[8];  // same length as ref_frame_map
+    UINT  ref_frame_coded_height[8]; // same length as ref_frame_map
+    DXVA_PicEntry_VPx  frame_refs[3];
+    CHAR  ref_frame_sign_bias[4];
+    CHAR  filter_level;
+    CHAR  sharpness_level;
+    union {
+        struct {
+            UCHAR mode_ref_delta_enabled : 1;
+            UCHAR mode_ref_delta_update : 1;
+            UCHAR use_prev_in_find_mv_refs : 1;
+            UCHAR ReservedControlInfo5Bits : 5; // brings the bitfield total to 8 bits
+        };
+        UCHAR wControlInfoFlags; // the same byte as the bitfield above
+    };
+    CHAR   ref_deltas[4];
+    CHAR   mode_deltas[2];
+    SHORT  base_qindex;
+    CHAR   y_dc_delta_q;
+    CHAR   uv_dc_delta_q;
+    CHAR   uv_ac_delta_q;
+    DXVA_segmentation_VP9 stVP9Segments;
+    UCHAR  log2_tile_cols;
+    UCHAR  log2_tile_rows;
+    USHORT uncompressed_header_size_byte_aligned;
+    USHORT first_partition_size;
+    USHORT Reserved16Bits;
+    UINT   Reserved32Bits;
+    UINT   StatusReportFeedbackNumber;
+} DXVA_PicParams_VP9, *LPDXVA_PicParams_VP9;
+
+/* VP8 segmentation structure: control flags, segment feature data and segment tree probabilities */
+typedef struct _segmentation_VP8 {
+    union {
+        struct {
+            UCHAR segmentation_enabled : 1;
+            UCHAR update_mb_segmentation_map : 1;
+            UCHAR update_mb_segmentation_data : 1;
+            UCHAR mb_segement_abs_delta : 1;
+            UCHAR ReservedSegmentFlags4Bits : 4; // brings the bitfield total to 8 bits
+        };
+        UCHAR wSegmentFlags; // the same byte as the bitfield above
+    };
+    CHAR  segment_feature_data[2][4];
+    UCHAR mb_segment_tree_probs[3];
+} DXVA_segmentation_VP8;
+
+/* VP8 picture parameters structure: frame tag flags, segmentation, loop filter, quantizer deltas, reference buffers and probability tables */
+typedef struct _DXVA_PicParams_VP8 {
+    UINT first_part_size;
+    UINT width;
+    UINT height;
+    DXVA_PicEntry_VPx  CurrPic;
+    union {
+        struct {
+            UCHAR frame_type : 1;
+            UCHAR version : 3;
+            UCHAR show_frame : 1;
+            UCHAR clamp_type : 1;
+            UCHAR ReservedFrameTag3Bits : 2; // 2 bits wide despite the name; brings the bitfield total to 8 bits
+        };
+        UCHAR wFrameTagFlags; // the same byte as the bitfield above
+    };
+    DXVA_segmentation_VP8  stVP8Segments;
+    UCHAR filter_type;
+    UCHAR filter_level;
+    UCHAR sharpness_level;
+    UCHAR mode_ref_lf_delta_enabled;
+    UCHAR mode_ref_lf_delta_update;
+    CHAR  ref_lf_deltas[4];
+    CHAR  mode_lf_deltas[4];
+    UCHAR log2_nbr_of_dct_partitions;
+    UCHAR base_qindex;
+    CHAR  y1dc_delta_q;
+    CHAR  y2dc_delta_q;
+    CHAR  y2ac_delta_q;
+    CHAR  uvdc_delta_q;
+    CHAR  uvac_delta_q;
+    DXVA_PicEntry_VPx alt_fb_idx;
+    DXVA_PicEntry_VPx gld_fb_idx;
+    DXVA_PicEntry_VPx lst_fb_idx;
+    UCHAR  ref_frame_sign_bias_golden;
+    UCHAR  ref_frame_sign_bias_altref;
+    UCHAR  refresh_entropy_probs;
+    UCHAR  vp8_coef_update_probs[4][8][3][11];
+    UCHAR  mb_no_coeff_skip;
+    UCHAR  prob_skip_false;
+    UCHAR  prob_intra;
+    UCHAR  prob_last;
+    UCHAR  prob_golden;
+    UCHAR  intra_16x16_prob[4];
+    UCHAR  intra_chroma_prob[3];
+    UCHAR  vp8_mv_update_probs[2][19];
+    USHORT ReservedBits1;
+    USHORT ReservedBits2;
+    USHORT ReservedBits3;
+    UINT   StatusReportFeedbackNumber;
+} DXVA_PicParams_VP8, *LPDXVA_PicParams_VP8;
+
+/* VPx slice control data structure - short form: location and size of the slice data plus a chopping indicator */
+typedef struct _DXVA_Slice_VPx_Short {
+    UINT   BSNALunitDataLocation; // NOTE(review): presumably a byte offset within the bitstream buffer -- confirm
+    UINT   SliceBytesInBuffer;
+    USHORT wBadSliceChopping;
+} DXVA_Slice_VPx_Short, *LPDXVA_Slice_VPx_Short;
+
+/* VPx status reporting data structure: a feedback number, the picture entry it refers to and status fields */
+typedef struct _DXVA_Status_VPx {
+    UINT  StatusReportFeedbackNumber; // same field name as in the VP8/VP9 picture parameters; NOTE(review): presumably used to match a report to a submitted picture -- confirm
+    DXVA_PicEntry_VPx CurrPic;
+    UCHAR  bBufType;
+    UCHAR  bStatus;
+    UCHAR  bReserved8Bits;
+    USHORT wNumMbsAffected;
+} DXVA_Status_VPx, *LPDXVA_Status_VPx;
+
+#pragma pack(pop)
+#endif
--- a/29
+++ b/29
@ -2455,6 +2455,9 @@ TOOLCHAIN_FEATURES="
 TYPES_LIST="
    DPI_AWARENESS_CONTEXT
    IDXGIOutput5
+    DXVA_PicParams_AV1
+    DXVA_PicParams_HEVC
+    DXVA_PicParams_VP9
    kCMVideoCodecType_HEVC
    kCMVideoCodecType_HEVCWithAlpha
    kCMVideoCodecType_VP9
@ -3134,13 +3137,13 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
 vulkan_deps="threads"
 vulkan_deps_any="libdl LoadLibrary"

-av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
+av1_d3d11va_hwaccel_deps="d3d11va"
 av1_d3d11va_hwaccel_select="av1_decoder"
-av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
+av1_d3d11va2_hwaccel_deps="d3d11va"
 av1_d3d11va2_hwaccel_select="av1_decoder"
-av1_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_AV1"
+av1_d3d12va_hwaccel_deps="d3d12va"
 av1_d3d12va_hwaccel_select="av1_decoder"
-av1_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_AV1"
+av1_dxva2_hwaccel_deps="dxva2"
 av1_dxva2_hwaccel_select="av1_decoder"
 av1_nvdec_hwaccel_deps="nvdec CUVIDAV1PICPARAMS"
 av1_nvdec_hwaccel_select="av1_decoder"
@ -3172,13 +3175,13 @@ h264_videotoolbox_hwaccel_deps="videotoolbox"
 h264_videotoolbox_hwaccel_select="h264_decoder"
 h264_vulkan_hwaccel_deps="vulkan"
 h264_vulkan_hwaccel_select="h264_decoder"
-hevc_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_HEVC"
+hevc_d3d11va_hwaccel_deps="d3d11va"
 hevc_d3d11va_hwaccel_select="hevc_decoder"
-hevc_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_HEVC"
+hevc_d3d11va2_hwaccel_deps="d3d11va"
 hevc_d3d11va2_hwaccel_select="hevc_decoder"
-hevc_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_HEVC"
+hevc_d3d12va_hwaccel_deps="d3d12va"
 hevc_d3d12va_hwaccel_select="hevc_decoder"
-hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
+hevc_dxva2_hwaccel_deps="dxva2"
 hevc_dxva2_hwaccel_select="hevc_decoder"
 hevc_nvdec_hwaccel_deps="nvdec"
 hevc_nvdec_hwaccel_select="hevc_decoder"
@ -3244,13 +3247,13 @@ vp8_nvdec_hwaccel_deps="nvdec"
 vp8_nvdec_hwaccel_select="vp8_decoder"
 vp8_vaapi_hwaccel_deps="vaapi"
 vp8_vaapi_hwaccel_select="vp8_decoder"
-vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
+vp9_d3d11va_hwaccel_deps="d3d11va"
 vp9_d3d11va_hwaccel_select="vp9_decoder"
-vp9_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
+vp9_d3d11va2_hwaccel_deps="d3d11va"
 vp9_d3d11va2_hwaccel_select="vp9_decoder"
-vp9_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_VP9"
+vp9_d3d12va_hwaccel_deps="d3d12va"
 vp9_d3d12va_hwaccel_select="vp9_decoder"
-vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9"
+vp9_dxva2_hwaccel_deps="dxva2"
 vp9_dxva2_hwaccel_select="vp9_decoder"
 vp9_nvdec_hwaccel_deps="nvdec"
 vp9_nvdec_hwaccel_select="vp9_decoder"
@ -6939,7 +6942,7 @@ enabled libsmbclient      && { check_pkg_config libsmbclient smbclient libsmbcli
 enabled libsnappy         && require libsnappy snappy-c.h snappy_compress -lsnappy -lstdc++
 enabled libsoxr           && require libsoxr soxr.h soxr_create -lsoxr
 enabled libssh            && require_pkg_config libssh "libssh >= 0.6.0" libssh/sftp.h sftp_init
-enabled libspeex          && require_pkg_config libspeex speex speex/speex.h speex_decoder_init
+enabled libspeex          && require libspeex speex/speex.h speex_decoder_init -lspeex
 enabled libsrt            && require_pkg_config libsrt "srt >= 1.3.0" srt/srt.h srt_socket
 enabled libsvtav1         && require_pkg_config libsvtav1 "SvtAv1Enc >= 0.9.0" EbSvtAv1Enc.h svt_av1_enc_init_handle
 enabled libtensorflow     && require libtensorflow tensorflow/c/c_api.h TF_Version -ltensorflow
--- a/ffbuild/.gitignore
+++ b/ffbuild/.gitignore
@ -5,3 +5,4 @@
 /config.log
 /config.mak
 /config.sh
+/config.out
--- a/libavcodec/aacdec_common.c
+++ b/libavcodec/aacdec_common.c
@ -43,7 +43,7 @@ const uint8_t ff_aac_channel_layout_map[16][16][3] = {
    { { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_SCE, 1, AAC_CHANNEL_BACK }, },
    { { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_BACK }, },
    { { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE  }, },
-    { { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_FRONT }, { TYPE_CPE, 2, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE  }, },
+    { { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_SIDE }, { TYPE_CPE, 2, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE  }, },
    { { 0, } },
    { { 0, } },
    { { 0, } },
--- a/libavcodec/aacdec_template.c
+++ b/libavcodec/aacdec_template.c
@ -577,7 +577,7 @@ static ChannelElement *get_che(AACDecContext *ac, int type, int elem_id)
 {
    /* For PCE based channel configurations map the channels solely based
     * on tags. */
-    if (!ac->oc[1].m4ac.chan_config) {
+    if (!ac->oc[1].m4ac.chan_config || ac->oc[1].m4ac.pce) {
        return ac->tag_che_map[type][elem_id];
    }
    // Allow single CPE stereo files to be signalled with mono configuration.
@ -3219,7 +3219,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, AVFrame *frame,
            } else {
                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
                if (!err)
-                    ac->oc[1].m4ac.chan_config = 0;
+                    ac->oc[1].m4ac.pce = 1;
                pce_found = 1;
            }
            break;
--- a/libavcodec/av1dec.c
+++ b/libavcodec/av1dec.c
@ -1150,7 +1150,7 @@ static int set_output_frame(AVCodecContext *avctx, AVFrame *frame)
    // TODO: all layers
    if (s->operating_point_idc &&
        av_log2(s->operating_point_idc >> 8) > s->cur_frame.spatial_id)
-        return 0;
+        return AVERROR(EAGAIN);

    ret = av_frame_ref(frame, srcframe);
    if (ret < 0)
@ -1345,7 +1345,7 @@ static int av1_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)

                if (s->cur_frame.f->buf[0]) {
                    ret = set_output_frame(avctx, frame);
-                    if (ret < 0)
+                    if (ret < 0 && ret != AVERROR(EAGAIN))
                        av_log(avctx, AV_LOG_ERROR, "Set output frame error.\n");
                }

@ -1457,11 +1457,13 @@ static int av1_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)

            if (s->raw_frame_header->show_frame && s->cur_frame.f->buf[0]) {
                ret = set_output_frame(avctx, frame);
-                if (ret < 0) {
+                if (ret < 0 && ret != AVERROR(EAGAIN)) {
                    av_log(avctx, AV_LOG_ERROR, "Set output frame error\n");
                    goto end;
                }
-            }
+            } else if (show_frame)
+                ret = AVERROR_INVALIDDATA;
+
            raw_tile_group = NULL;
            s->raw_frame_header = NULL;
            if (show_frame) {
--- a/libavcodec/avcodec.c
+++ b/libavcodec/avcodec.c
@ -247,8 +247,10 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
        && !(codec->capabilities & AV_CODEC_CAP_CHANNEL_CONF)) {
        av_log(avctx, AV_LOG_ERROR, "%s requires channel layout to be set\n",
               av_codec_is_decoder(codec) ? "Decoder" : "Encoder");
-        ret = AVERROR(EINVAL);
-        goto free_and_end;
+        if (!av_codec_is_decoder(codec)) {
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
    }
    if (avctx->ch_layout.nb_channels && !av_channel_layout_check(&avctx->ch_layout)) {
        av_log(avctx, AV_LOG_ERROR, "Invalid channel layout\n");
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@ -1665,6 +1665,8 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_DTS_ES             30
 #define FF_PROFILE_DTS_96_24          40
 #define FF_PROFILE_DTS_HD_HRA         50
+#define FF_PROFILE_DTS_HD_HRA_X       51
+#define FF_PROFILE_DTS_HD_HRA_X_IMAX  52
 #define FF_PROFILE_DTS_HD_MA          60
 #define FF_PROFILE_DTS_EXPRESS        70
 #define FF_PROFILE_DTS_HD_MA_X        61
@ -1696,11 +1698,13 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_H264_HIGH_422             122
 #define FF_PROFILE_H264_HIGH_422_INTRA       (122|FF_PROFILE_H264_INTRA)
 #define FF_PROFILE_H264_STEREO_HIGH          128
+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
 #define FF_PROFILE_H264_HIGH_444             144
 #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
 #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
 #define FF_PROFILE_H264_CAVLC_444            44

+
 #define FF_PROFILE_VC1_SIMPLE   0
 #define FF_PROFILE_VC1_MAIN     1
 #define FF_PROFILE_VC1_COMPLEX  2
@ -2029,6 +2033,13 @@ typedef struct AVCodecContext {
     */
    int64_t frame_num;

+    /**
+     * Is the stream completely progressive?
+     * - decoding: set by avcodec
+     * - encoding: unused
+     */
+    int progressive_sequence;
+
    /**
     * Decoding only. May be set by the caller before avcodec_open2() to an
     * av_malloc()'ed array (or via AVOptions). Owned and freed by the decoder
@ -2224,6 +2235,12 @@ typedef struct AVSubtitleRect {
    char *ass;
 } AVSubtitleRect;

+typedef struct AVSubtitleDVDPalette { /* one DVD subtitle palette entry; AVSubtitle holds an array of pointers to these */
+    uint32_t start_display_time; /* NOTE(review): units not stated here; AVSubtitle.start_display_time is in ms relative to packet pts -- confirm this matches */
+    uint8_t colormap[4];         /* four entries; NOTE(review): presumably palette indices -- confirm */
+    uint8_t alpha[4];            /* four entries, same length as colormap */
+} AVSubtitleDVDPalette;
+
 typedef struct AVSubtitle {
    uint16_t format; /* 0 = graphics */
    uint32_t start_display_time; /* relative to packet pts, in ms */
@ -2231,6 +2248,9 @@ typedef struct AVSubtitle {
    unsigned num_rects;
    AVSubtitleRect **rects;
    int64_t pts;    ///< Same as packet pts, in AV_TIME_BASE
+
+    unsigned num_dvd_palette;
+    AVSubtitleDVDPalette **dvd_palette;
 } AVSubtitle;

 /**
--- a/libavcodec/bsf/extract_extradata.c
+++ b/libavcodec/bsf/extract_extradata.c
@ -166,10 +166,10 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
        VVC_VPS_NUT, VVC_SPS_NUT, VVC_PPS_NUT,
    };
    static const int extradata_nal_types_hevc[] = {
-        HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
+        HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS, HEVC_NAL_SEI_PREFIX, HEVC_NAL_SEI_SUFFIX,
    };
    static const int extradata_nal_types_h264[] = {
-        H264_NAL_SPS, H264_NAL_PPS,
+        H264_NAL_SPS, H264_NAL_SUB_SPS, H264_NAL_PPS, H264_NAL_SEI,
    };

    ExtractExtradataContext *s = ctx->priv_data;
@ -206,7 +206,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
                if (nal->type == HEVC_NAL_SPS) has_sps = 1;
                if (nal->type == HEVC_NAL_VPS) has_vps = 1;
            } else {
-                if (nal->type == H264_NAL_SPS) has_sps = 1;
+                if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SUB_SPS) has_sps = 1;
            }
        } else if (s->remove) {
            filtered_size += nal->raw_size + 3;
@ -216,7 +216,8 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
    if (extradata_size &&
        ((ctx->par_in->codec_id == AV_CODEC_ID_VVC  && has_sps) ||
         (ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
-         (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
+         (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps) ||
+         (ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC && has_sps))) {
        AVBufferRef *filtered_buf = NULL;
        PutByteContext pb_filtered_data, pb_extradata;
        uint8_t *extradata;
@ -368,6 +369,7 @@ static const struct {
    { AV_CODEC_ID_AVS3,       extract_extradata_mpeg4   },
    { AV_CODEC_ID_CAVS,       extract_extradata_mpeg4   },
    { AV_CODEC_ID_H264,       extract_extradata_h2645   },
+    { AV_CODEC_ID_H264_MVC,   extract_extradata_h2645   },
    { AV_CODEC_ID_HEVC,       extract_extradata_h2645   },
    { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12  },
    { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12  },
@ -438,6 +440,7 @@ static const enum AVCodecID codec_ids[] = {
    AV_CODEC_ID_AVS3,
    AV_CODEC_ID_CAVS,
    AV_CODEC_ID_H264,
+    AV_CODEC_ID_H264_MVC,
    AV_CODEC_ID_HEVC,
    AV_CODEC_ID_MPEG1VIDEO,
    AV_CODEC_ID_MPEG2VIDEO,
--- a/libavcodec/bsf/remove_extradata.c
+++ b/libavcodec/bsf/remove_extradata.c
@ -76,7 +76,7 @@ static int h264_split(const uint8_t *buf, int buf_size)
        if ((state & 0xFFFFFF00) != 0x100)
            break;
        nalu_type = state & 0x1F;
-        if (nalu_type == H264_NAL_SPS) {
+        if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SUB_SPS) {
            has_sps = 1;
        } else if (nalu_type == H264_NAL_PPS)
            has_pps = 1;
@ -204,6 +204,7 @@ static int remove_extradata(AVBSFContext *ctx, AVPacket *pkt)
            i = mpeg4video_split(pkt->data, pkt->size);
            break;
        case AV_CODEC_ID_H264:
+        case AV_CODEC_ID_H264_MVC:
            i = h264_split(pkt->data, pkt->size);
            break;
        case AV_CODEC_ID_HEVC:
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@ -1959,6 +1959,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
    },

+    {
+        .id        = AV_CODEC_ID_H264_MVC,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "h264_mvc",
+        .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+
    /* various PCM "codecs" */
    {
        .id        = AV_CODEC_ID_PCM_S16LE,
--- a/libavcodec/codec_id.h
+++ b/libavcodec/codec_id.h
@ -323,6 +323,8 @@ enum AVCodecID {
    AV_CODEC_ID_VMIX,
    AV_CODEC_ID_LEAD,

+    AV_CODEC_ID_H264_MVC,
+
    /* various PCM "codecs" */
    AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
    AV_CODEC_ID_PCM_S16LE = 0x10000,
--- a/libavcodec/dca_core.c
+++ b/libavcodec/dca_core.c
@ -2369,9 +2369,14 @@ int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame)
        return ret;

    // Set profile, bit rate, etc
-    if (s->ext_audio_mask & DCA_EXSS_MASK)
-        avctx->profile = AV_PROFILE_DTS_HD_HRA;
-    else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
+    if (s->ext_audio_mask & DCA_EXSS_MASK) {
+        /* The EXSS parser flags DTS:X and DTS:X IMAX extension syncwords;
+         * report them as distinct profiles, falling back to plain HD HRA. */
+        if (dca->exss.x_syncword_present)
+            avctx->profile = AV_PROFILE_DTS_HD_HRA_X;
+        else if (dca->exss.x_imax_syncword_present)
+            avctx->profile = AV_PROFILE_DTS_HD_HRA_X_IMAX;
+        else
+            avctx->profile = AV_PROFILE_DTS_HD_HRA;
+    } else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
        avctx->profile = AV_PROFILE_DTS_ES;
    else if (s->ext_audio_mask & DCA_CSS_X96)
        avctx->profile = AV_PROFILE_DTS_96_24;
--- a/libavcodec/dca_exss.c
+++ b/libavcodec/dca_exss.c
@ -19,6 +19,7 @@
 */

 #include "dcadec.h"
+#include "dca_syncwords.h"

 static void parse_xll_parameters(DCAExssParser *s, DCAExssAsset *asset)
 {
@ -510,5 +511,17 @@ int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size)
        return AVERROR_INVALIDDATA;
    }

+    // Check for extradata extensions
+    if ((s->exss_size - offset) > 10) {
+        if (AV_RB32(data + offset) == 0x3a429b0a) {
+            unsigned int extradata_syncword = AV_RB32(data + offset + 6);
+            if (extradata_syncword == DCA_SYNCWORD_XLL_X) {
+                s->x_syncword_present = 1;
+            } else if ((extradata_syncword >> 1) == (DCA_SYNCWORD_XLL_X_IMAX >> 1)) {
+                s->x_imax_syncword_present = 1;
+            }
+        }
+    }
+
    return 0;
 }
--- a/libavcodec/dca_exss.h
+++ b/libavcodec/dca_exss.h
@ -84,6 +84,9 @@ typedef struct DCAExssParser {
    int     nmixoutconfigs;         ///< Number of mixing configurations
    int     nmixoutchs[4];          ///< Speaker layout mask for mixer output channels

+    int     x_syncword_present;     ///< DTS:X extension syncword detected
+    int     x_imax_syncword_present;///< DTS:X IMAX extension syncword detected
+
    DCAExssAsset   assets[1];    ///< Audio asset descriptors
 } DCAExssParser;

--- a/libavcodec/defs.h
+++ b/libavcodec/defs.h
@ -87,6 +87,8 @@
 #define AV_PROFILE_DTS_ES             30
 #define AV_PROFILE_DTS_96_24          40
 #define AV_PROFILE_DTS_HD_HRA         50
+#define AV_PROFILE_DTS_HD_HRA_X       51
+#define AV_PROFILE_DTS_HD_HRA_X_IMAX  52
 #define AV_PROFILE_DTS_HD_MA          60
 #define AV_PROFILE_DTS_EXPRESS        70
 #define AV_PROFILE_DTS_HD_MA_X        61
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@ -146,16 +146,12 @@ static void guess_palette(DVDSubContext* ctx,
                          uint32_t *rgba_palette,
                          uint32_t subtitle_color)
 {
-    static const uint8_t level_map[4][4] = {
+    static const uint8_t level_map[4] = {
        // this configuration (full range, lowest to highest) in tests
        // seemed most common, so assume this
-        {0xff},
-        {0x00, 0xff},
-        {0x00, 0x80, 0xff},
-        {0x00, 0x55, 0xaa, 0xff},
+        0x00, 0xe0, 0x80, 0x20
    };
-    uint8_t color_used[16] = { 0 };
-    int nb_opaque_colors, i, level, j, r, g, b;
+    int i, level, r, g, b;
    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;

    if(ctx->has_palette) {
@ -168,33 +164,13 @@ static void guess_palette(DVDSubContext* ctx,
    for(i = 0; i < 4; i++)
        rgba_palette[i] = 0;

-    nb_opaque_colors = 0;
-    for(i = 0; i < 4; i++) {
-        if (alpha[i] != 0 && !color_used[colormap[i]]) {
-            color_used[colormap[i]] = 1;
-            nb_opaque_colors++;
-        }
-    }
-
-    if (nb_opaque_colors == 0)
-        return;
-
-    j = 0;
-    memset(color_used, 0, 16);
    for(i = 0; i < 4; i++) {
        if (alpha[i] != 0) {
-            if (!color_used[colormap[i]])  {
-                level = level_map[nb_opaque_colors - 1][j];
-                r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
-                g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
-                b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
-                rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
-                color_used[colormap[i]] = (i + 1);
-                j++;
-            } else {
-                rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
-                                    ((alpha[i] * 17U) << 24);
-            }
+            level = level_map[i];
+            r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
+            g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
+            b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
+            rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
        }
    }
 }
@ -348,7 +324,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
            case 0xff:
                goto the_end;
            default:
-                ff_dlog(NULL, "unrecognised subpicture command 0x%x\n", cmd);
+                av_log(ctx, AV_LOG_WARNING, "unrecognised subpicture command 0x%x\n", cmd);
                goto the_end;
            }
        }
@ -356,6 +332,14 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
        if (offset1 >= buf_size || offset2 >= buf_size)
            goto fail;

+        /* store dvd palette info in subtitle struct for use by caller */
+        {
+            AVSubtitleDVDPalette **palettes;
+
+            i = sub_header->num_dvd_palette;
+            /* grow through a temporary so the existing array is not lost
+             * (and leaked) if the reallocation fails */
+            palettes = av_realloc_array(sub_header->dvd_palette, i + 1, sizeof(*palettes));
+            if (!palettes)
+                goto fail;
+            sub_header->dvd_palette = palettes;
+
+            palettes[i] = av_mallocz(sizeof(*palettes[i]));
+            if (!palettes[i])
+                goto fail;
+            /* only count the entry once it has actually been allocated */
+            sub_header->num_dvd_palette = i + 1;
+
+            palettes[i]->start_display_time = (date << 10) / 90;
+            memcpy(palettes[i]->colormap, colormap, 4);
+            memcpy(palettes[i]->alpha, alpha, 4);
+        }
+        /* parse rle subtitles */
        if (offset1 >= 0 && offset2 >= 0) {
            int w, h;
            uint8_t *bitmap;
--- a/libavcodec/dxva2.c
+++ b/libavcodec/dxva2.c
@ -754,14 +754,13 @@ static void *get_surface(const AVCodecContext *avctx, const AVFrame *frame)
 {
 #if CONFIG_D3D11VA
    if (frame->format == AV_PIX_FMT_D3D11) {
-        FFDXVASharedContext *sctx = DXVA_SHARED_CONTEXT(avctx);
+        AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
        intptr_t index = (intptr_t)frame->data[1];
-        if (index < 0 || index >= sctx->nb_d3d11_views ||
-            sctx->d3d11_texture != (ID3D11Texture2D *)frame->data[0]) {
+        if (index < 0 || index >= D3D11VA_CONTEXT(ctx)->surface_count) {
            av_log((void *)avctx, AV_LOG_ERROR, "get_buffer frame is invalid!\n");
            return NULL;
        }
-        return sctx->d3d11_views[index];
+        return D3D11VA_CONTEXT(ctx)->surface[index];
    }
 #endif
    return frame->data[3];
--- a/libavcodec/dxva2_av1.c
+++ b/libavcodec/dxva2_av1.c
@ -29,6 +29,10 @@
 #include "av1dec.h"
 #include "hwaccel_internal.h"

+#if !HAVE_DXVA_PICPARAMS_AV1
+#include "compat/windows/dxva_av1.h"
+#endif
+
 #define MAX_TILES 256

 struct AV1DXVAContext {
--- a/libavcodec/dxva2_hevc.c
+++ b/libavcodec/dxva2_hevc.c
@ -29,6 +29,10 @@
 #include "hevcdec.h"
 #include "hwaccel_internal.h"

+#if !HAVE_DXVA_PICPARAMS_HEVC
+#include "compat/windows/dxva_hevc.h"
+#endif
+
 #define MAX_SLICES 256

 struct hevc_dxva2_picture_context {
@ -164,7 +168,7 @@ void ff_dxva2_hevc_fill_picture_parameters(const AVCodecContext *avctx, AVDXVACo
    for (i = 0, j = 0; i < FF_ARRAY_ELEMS(pp->RefPicList); i++) {
        const HEVCFrame *frame = NULL;
        while (!frame && j < FF_ARRAY_ELEMS(h->DPB)) {
-            if (&h->DPB[j] != current_picture && (h->DPB[j].flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF)))
+            if (&h->DPB[j] != current_picture && (h->DPB[j].flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF)) && !h->DPB[j].missing)
                frame = &h->DPB[j];
            j++;
        }
--- a/libavcodec/dxva2_vp9.c
+++ b/libavcodec/dxva2_vp9.c
@ -29,6 +29,10 @@
 #include "hwaccel_internal.h"
 #include "vp9shared.h"

+#if !HAVE_DXVA_PICPARAMS_VP9
+#include "compat/windows/dxva_vpx.h"
+#endif
+
 struct vp9_dxva2_picture_context {
    DXVA_PicParams_VP9    pp;
    DXVA_Slice_VPx_Short  slice;
--- a/libavcodec/fraps.c
+++ b/libavcodec/fraps.c
@ -42,6 +42,7 @@
 #include "bswapdsp.h"
 #include "codec_internal.h"
 #include "thread.h"
+#include "threadframe.h"

 #define FPS_TAG MKTAG('F', 'P', 'S', 'x')
 #define VLC_BITS 11
@ -52,10 +53,15 @@
 typedef struct FrapsContext {
    AVCodecContext *avctx;
    BswapDSPContext bdsp;
+    int cur_index, prev_index;
+    int next_cur_index, next_prev_index;
+    ThreadFrame frames[2];
    uint8_t *tmpbuf;
    int tmpbuf_size;
 } FrapsContext;

+static av_cold int decode_end(AVCodecContext *avctx);
+
 /**
 * initializes decoder
 * @param avctx codec context
@ -64,12 +70,46 @@ typedef struct FrapsContext {
 static av_cold int decode_init(AVCodecContext *avctx)
 {
    FrapsContext * const s = avctx->priv_data;
+    int i;
+
+    s->prev_index = 0;
+    s->cur_index = 1;

    s->avctx  = avctx;
    s->tmpbuf = NULL;

    ff_bswapdsp_init(&s->bdsp);

+    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
+        s->frames[i].f = av_frame_alloc();
+        if (!s->frames[i].f) {
+            decode_end(avctx);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Frame-threading hook: bring this thread's FrapsContext in line with the
+ * context it is updated from.
+ *
+ * Takes over the source's "next" current/previous frame indices and replaces
+ * every local frame reference with a reference to the corresponding source
+ * frame, for those source frames that hold data.
+ *
+ * @param avctx      destination codec context
+ * @param avctx_from source codec context
+ * @return 0 on success, a negative error code if referencing a frame fails
+ */
+static int update_thread_context(AVCodecContext *avctx, const AVCodecContext *avctx_from)
+{
+    FrapsContext *dst = avctx->priv_data, *src = avctx_from->priv_data;
+    int i, ret;
+
+    /* nothing to transfer when source and destination are the same context */
+    if (avctx == avctx_from) return 0;
+
+    /* the indices the source computed for its successor become ours */
+    dst->cur_index  = src->next_cur_index;
+    dst->prev_index = src->next_prev_index;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(dst->frames); i++) {
+        /* drop our old reference before taking the source's one */
+        ff_thread_release_ext_buffer(&dst->frames[i]);
+        if (src->frames[i].f->data[0]) {
+            ret = ff_thread_ref_frame(&dst->frames[i], &src->frames[i]);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
    return 0;
 }

@ -132,18 +172,52 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
    return 0;
 }

-static int decode_frame(AVCodecContext *avctx, AVFrame *f,
+/**
+ * Copy picture data plane by plane, row by row, from src to dst.
+ *
+ * For odd versions a single plane of width * 3 bytes per row is copied; for
+ * even versions three planes are copied, the second and third at half width
+ * and half height. Before each plane is copied, progress of
+ * s->frames[s->prev_index] is awaited for that plane index; afterwards
+ * progress is reported on s->frames[s->cur_index] with the same index.
+ *
+ * @param s              decoder context holding the thread frames
+ * @param dst_data       destination plane pointers
+ * @param dst_linesizes  destination strides in bytes
+ * @param src_data       source plane pointers
+ * @param src_linesizes  source strides in bytes
+ * @param version        codec version; bit 0 selects the packed 3-bytes-per-pixel layout
+ * @param width          picture width in pixels
+ * @param height         picture height in rows
+ */
+static void frame_copy(FrapsContext *s,
+                       uint8_t *dst_data[3], const int dst_linesizes[3],
+                       uint8_t *src_data[3], const int src_linesizes[3],
+                       unsigned int version, int width, int height)
+{
+    int i, k, h, bwidth;
+    uint8_t *src, *dst;
+    int planes = (version & 1) ? 1 : 3;
+
+    for (i = 0; i < planes; i++) {
+        dst = dst_data[i];
+        src = src_data[i];
+        if (version & 1) {
+            /* RGB data */
+            h = height;
+            bwidth = width * 3;
+        } else {
+            /* YUV 4:2:0 data */
+            h = i ? height >> 1 : height;
+            bwidth = i ? width >> 1 : width;
+        }
+
+        /* the source plane must be available before it is read */
+        ff_thread_await_progress(&s->frames[s->prev_index], i, 0);
+        for (k = 0; k < h; k++) {
+            memcpy(dst, src, bwidth);
+            dst += dst_linesizes[i];
+            src += src_linesizes[i];
+        }
+        /* signal that this plane of the current frame has been written */
+        ff_thread_report_progress(&s->frames[s->cur_index], i, 0);
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx, AVFrame *fout,
                        int *got_frame, AVPacket *avpkt)
 {
    FrapsContext * const s = avctx->priv_data;
    const uint8_t *buf     = avpkt->data;
    int buf_size           = avpkt->size;
+    ThreadFrame *frame, *prev_frame;
+    AVFrame *f;
    uint32_t header;
    unsigned int version,header_size;
    const uint32_t *buf32;
    uint32_t *luma1,*luma2,*cb,*cr;
    uint32_t offs[4];
-    int i, j, ret, is_chroma;
+    int i, j, ret, is_chroma, is_Pframe;
    const int planes = 3;
    int is_pal;
    uint8_t *out;
@ -153,6 +227,10 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
        return AVERROR_INVALIDDATA;
    }

+    frame = &s->frames[s->cur_index];
+    prev_frame = &s->frames[s->prev_index];
+    f = frame->f;
+
    header      = AV_RL32(buf);
    version     = header & 0xff;
    is_pal      = buf[1] == 2 && version == 1;
@ -179,22 +257,16 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
        if (version == 0) needed_size /= 2;
        needed_size += header_size;
        /* bit 31 means same as previous pic */
-        if (header & (1U<<31)) {
-            *got_frame = 0;
-            return buf_size;
-        }
-        if (buf_size != needed_size) {
+        is_Pframe = (header & (1U<<31)) ? 1 : 0;
+        if (!is_Pframe && buf_size != needed_size) {
            av_log(avctx, AV_LOG_ERROR,
                   "Invalid frame length %d (should be %d)\n",
                   buf_size, needed_size);
            return AVERROR_INVALIDDATA;
        }
    } else {
-        /* skip frame */
-        if (buf_size == 8) {
-            *got_frame = 0;
-            return buf_size;
-        }
+        is_Pframe = buf_size == 8 ? 1 : 0;
+        if (!is_Pframe) {
        if (AV_RL32(buf) != FPS_TAG || buf_size < planes*1024 + 24) {
            av_log(avctx, AV_LOG_ERROR, "error in data stream\n");
            return AVERROR_INVALIDDATA;
@ -212,19 +284,43 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
            if (!s->tmpbuf)
                return AVERROR(ENOMEM);
        }
+        }
    }

-    f->pict_type = AV_PICTURE_TYPE_I;
-    f->flags |= AV_FRAME_FLAG_KEY;
+    if (is_Pframe && !prev_frame->f->data[0]) {
+        av_log(avctx, AV_LOG_ERROR, "decoding must start with keyframe\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ff_thread_release_ext_buffer(frame);
+
+    /* "same as previous" frames are exported as P-frames, everything else as keyframes */
+    f->pict_type = is_Pframe ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+    f->flags |= is_Pframe ? 0 : AV_FRAME_FLAG_KEY;

    avctx->pix_fmt = version & 1 ? is_pal ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
    avctx->color_range = version & 1 ? AVCOL_RANGE_UNSPECIFIED
                                     : AVCOL_RANGE_JPEG;
    avctx->colorspace = version & 1 ? AVCOL_SPC_UNSPECIFIED : AVCOL_SPC_BT709;

-    if ((ret = ff_thread_get_buffer(avctx, f, 0)) < 0)
+    if ((ret = ff_thread_get_ext_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;

+    s->next_prev_index = s->cur_index;
+    s->next_cur_index  = (s->cur_index - 1) & 1;
+
+    ff_thread_finish_setup(avctx);
+
+    /* Copy previous frame */
+    if (is_Pframe) {
+        frame_copy(s,
+                   frame->f->data,
+                   frame->f->linesize,
+                   prev_frame->f->data,
+                   prev_frame->f->linesize,
+                   version, avctx->width, avctx->height);
+        goto end;
+    }
+
    switch (version) {
    case 0:
    default:
@ -250,6 +346,7 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
                *cb++    = *buf32++;
            }
        }
+        ff_thread_report_progress(frame, INT_MAX, 0);
        break;

    case 1:
@ -272,6 +369,7 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
                       &buf[y * avctx->width * 3],
                       3 * avctx->width);
        }
+        ff_thread_report_progress(frame, INT_MAX, 0);
        break;

    case 2:
@ -288,8 +386,13 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
                                           buf + offs[i], offs[i + 1] - offs[i],
                                           is_chroma, 1)) < 0) {
                av_log(avctx, AV_LOG_ERROR, "Error decoding plane %i\n", i);
-                return ret;
-            }
+                if (avctx->active_thread_type & FF_THREAD_FRAME) {
+                    ff_thread_report_progress(frame, INT_MAX, 0);
+                    break;
+                } else
+                    return ret;
+            } else
+                ff_thread_report_progress(frame, i, 0);
        }
        break;
    case 3:
@ -300,7 +403,10 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
                                           -f->linesize[0], avctx->width, avctx->height,
                                           buf + offs[i], offs[i + 1] - offs[i], 0, 3)) < 0) {
                av_log(avctx, AV_LOG_ERROR, "Error decoding plane %i\n", i);
-                return ret;
+                if (avctx->active_thread_type & FF_THREAD_FRAME)
+                    break;
+                else
+                    return ret;
            }
        }
        out = f->data[0];
@ -314,11 +420,21 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
            }
            out += f->linesize[0] - 3*avctx->width;
        }
+        ff_thread_report_progress(frame, INT_MAX, 0);
        break;
    }

+end:
+    if ((ret = av_frame_ref(fout, frame->f)) < 0)
+        return ret;
    *got_frame = 1;

+    s->prev_index = s->next_prev_index;
+    s->cur_index  = s->next_cur_index;
+
+    /* Only release frames that aren't used anymore */
+    ff_thread_release_ext_buffer(&s->frames[s->cur_index]);
+
    return buf_size;
 }

@ -330,8 +446,16 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
 static av_cold int decode_end(AVCodecContext *avctx)
 {
    FrapsContext *s = avctx->priv_data;
+    int i;

    av_freep(&s->tmpbuf);
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
+        if (s->frames[i].f)
+            ff_thread_release_ext_buffer(&s->frames[i]);
+        av_frame_free(&s->frames[i].f);
+    }
+
    return 0;
 }

@ -345,4 +469,6 @@ const FFCodec ff_fraps_decoder = {
    .close          = decode_end,
    FF_CODEC_DECODE_CB(decode_frame),
    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal = FF_CODEC_CAP_ALLOCATE_PROGRESS,
+    UPDATE_THREAD_CONTEXT(update_thread_context),
 };
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@ -60,11 +60,13 @@ typedef struct H264ParseContext {
    int nal_length_size;
    int got_first;
    int picture_structure;
-    uint8_t parse_history[6];
+    uint8_t parse_history[9];
    int parse_history_count;
    int parse_last_mb;
    int64_t reference_dts;
    int last_frame_num, last_picture_structure;
+    int is_mvc;
+    int slice_ext;
 } H264ParseContext;

 static int find_start_code(const uint8_t *buf, int buf_size,
@ -122,14 +124,17 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
        } else if (state <= 5) {
            int nalu_type = buf[i] & 0x1F;
            if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
-                nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
+                nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
+                nalu_type == H264_NAL_SUB_SPS) {
                if (pc->frame_start_found) {
                    i++;
                    goto found;
                }
            } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
-                       nalu_type == H264_NAL_IDR_SLICE) {
+                       nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_EXTEN_SLICE)) {
                state += 8;
+
+                p->slice_ext = (nalu_type == H264_NAL_EXTEN_SLICE);
                continue;
            }
            state = 7;
@ -138,20 +143,22 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
            GetBitContext gb;
            p->parse_history[p->parse_history_count++] = buf[i];

-            init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
-            mb= get_ue_golomb_long(&gb);
-            if (get_bits_left(&gb) > 0 || p->parse_history_count > 5) {
-                p->parse_last_mb = mb;
-                if (pc->frame_start_found) {
-                    if (mb <= last_mb) {
-                        i -= p->parse_history_count - 1;
-                        p->parse_history_count = 0;
-                        goto found;
-                    }
-                } else
-                    pc->frame_start_found = 1;
-                p->parse_history_count = 0;
-                state = 7;
+            if (!p->slice_ext || p->parse_history_count > 3) {
+                init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
+                mb= get_ue_golomb_long(&gb);
+                if (get_bits_left(&gb) > 0 || p->parse_history_count > (5 + 3*p->slice_ext)) {
+                    p->parse_last_mb = mb;
+                    if (pc->frame_start_found) {
+                        if (mb <= last_mb) {
+                            i -= p->parse_history_count - 1;
+                            p->parse_history_count = 0;
+                            goto found;
+                        }
+                    } else
+                        pc->frame_start_found = 1;
+                    p->parse_history_count = 0;
+                    state = 7;
+                }
            }
        }
    }
@ -605,6 +612,9 @@ static int h264_parse(AVCodecParserContext *s,
    } else {
        next = h264_find_frame_end(p, buf, buf_size, avctx);

+        if (next == END_NOT_FOUND && pc->frame_start_found == 0)
+            s->fetch_timestamp = 1;
+
        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
            *poutbuf      = NULL;
            *poutbuf_size = 0;
@ -617,7 +627,8 @@ static int h264_parse(AVCodecParserContext *s,
        }
    }

-    parse_nal_units(s, avctx, buf, buf_size);
+    if (!p->is_mvc)
+        parse_nal_units(s, avctx, buf, buf_size);

    if (avctx->framerate.num)
        time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){2, 1}));
@ -688,3 +699,22 @@ const AVCodecParser ff_h264_parser = {
    .parser_parse   = h264_parse,
    .parser_close   = h264_close,
 };
+
+/**
+ * Parser init callback for the MVC variant: runs the regular H.264 parser
+ * initialisation and, if that succeeds, marks the context as MVC.
+ *
+ * @return 0 on success, the negative error code from init() otherwise
+ */
+static av_cold int init_mvc(AVCodecParserContext *s)
+{
+    H264ParseContext *const ctx = s->priv_data;
+    const int err = init(s);
+
+    if (err >= 0)
+        ctx->is_mvc = 1;
+
+    return err < 0 ? err : 0;
+}
+
+/* Parser for AV_CODEC_ID_H264_MVC. It uses the same context type and the same
+ * parse/close callbacks as ff_h264_parser; only the init callback differs
+ * (init_mvc sets is_mvc on the context). */
+AVCodecParser ff_h264_mvc_parser = {
+    .codec_ids      = { AV_CODEC_ID_H264_MVC },
+    .priv_data_size = sizeof(H264ParseContext),
+    .parser_init    = init_mvc,
+    .parser_parse   = h264_parse,
+    .parser_close   = h264_close,
+};
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@ -470,6 +470,7 @@ int ff_h264_update_thread_context_for_user(AVCodecContext *dst,

    h->is_avc = h1->is_avc;
    h->nal_length_size = h1->nal_length_size;
+    h->x264_build = h1->x264_build;

    return 0;
 }
@ -866,14 +867,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
        if (CHROMA444(h)) {
            if (h->avctx->colorspace == AVCOL_SPC_RGB)
                *fmt++ = AV_PIX_FMT_GBRP;
-            else if (h->avctx->color_range == AVCOL_RANGE_JPEG)
-                *fmt++ = AV_PIX_FMT_YUVJ444P;
            else
                *fmt++ = AV_PIX_FMT_YUV444P;
        } else if (CHROMA422(h)) {
-            if (h->avctx->color_range == AVCOL_RANGE_JPEG)
-                *fmt++ = AV_PIX_FMT_YUVJ422P;
-            else
                *fmt++ = AV_PIX_FMT_YUV422P;
        } else {
 #if CONFIG_H264_DXVA2_HWACCEL
@ -889,9 +885,6 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
 #if CONFIG_H264_VAAPI_HWACCEL
            *fmt++ = AV_PIX_FMT_VAAPI;
 #endif
-            if (h->avctx->color_range == AVCOL_RANGE_JPEG)
-                *fmt++ = AV_PIX_FMT_YUVJ420P;
-            else
                *fmt++ = AV_PIX_FMT_YUV420P;
        }
        break;
@ -1091,6 +1084,7 @@ static int h264_init_ps(H264Context *h, const H264SliceContext *sl, int first_sl
        h->avctx->profile = ff_h264_get_profile(sps);
        h->avctx->level   = sps->level_idc;
        h->avctx->refs    = sps->ref_frame_count;
+        h->avctx->progressive_sequence = sps->frame_mbs_only_flag;

        h->mb_width  = sps->mb_width;
        h->mb_height = sps->mb_height;
@ -1190,6 +1184,7 @@ static int h264_export_frame_props(H264Context *h)
        const H264SEIPictureTiming *pt = &h->sei.picture_timing;
        switch (pt->pic_struct) {
        case H264_SEI_PIC_STRUCT_FRAME:
+            interlaced_frame = FIELD_OR_MBAFF_PICTURE(h);
            break;
        case H264_SEI_PIC_STRUCT_TOP_FIELD:
        case H264_SEI_PIC_STRUCT_BOTTOM_FIELD:
--- a/libavcodec/h264dec.c
+++ b/libavcodec/h264dec.c
@ -417,6 +417,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
               ret = 0;
           }
        }
+
+        /* activate the first SPS to determine basic stream information */
+        if (!h->ps.sps) {
+            int i;
+            for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list) && !h->ps.sps; i++) {
+                if (h->ps.pps_list[i]) {
+                    ff_refstruct_replace(&h->ps.pps, h->ps.pps_list[i]);
+                    h->ps.sps = h->ps.pps->sps;
+                }
+            }
+        }
    }

    if (h->ps.sps) {
@ -508,9 +519,6 @@ static void h264_decode_flush(AVCodecContext *avctx)
    h->mb_y = 0;
    h->non_gray = 0;

-    ff_h264_free_tables(h);
-    h->context_initialized = 0;
-
    if (FF_HW_HAS_CB(avctx, flush))
        FF_HW_SIMPLE_CALL(avctx, flush);
 }
--- a/libavcodec/h264dec.h
+++ b/libavcodec/h264dec.h
@ -58,7 +58,7 @@
 * The maximum number of slices supported by the decoder.
 * must be a power of 2
 */
-#define MAX_SLICES 32
+#define MAX_SLICES 256

 #ifdef ALLOW_INTERLACE
 #define MB_MBAFF(h)    (h)->mb_mbaff
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@ -442,20 +442,13 @@ static int decode_hrd(GetBitContext *gb, int common_inf_present,
    return 0;
 }

-static void uninit_vps(FFRefStructOpaque opaque, void *obj)
-{
-    HEVCVPS *vps = obj;
-
-    av_freep(&vps->hdr);
-}
-
 int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
                           HEVCParamSets *ps)
 {
    int i,j;
    int vps_id = 0;
    ptrdiff_t nal_size;
-    HEVCVPS *vps = ff_refstruct_alloc_ext(sizeof(*vps), 0, NULL, uninit_vps);
+    HEVCVPS *vps = ff_refstruct_allocz(sizeof(*vps));

    if (!vps)
        return AVERROR(ENOMEM);
@ -544,11 +537,6 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
            goto err;
        }
-
-        vps->hdr = av_calloc(vps->vps_num_hrd_parameters, sizeof(*vps->hdr));
-        if (!vps->hdr)
-            goto err;
-
        for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
            int common_inf_present = 1;

@ -593,8 +581,6 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
    ff_h2645_decode_common_vui_params(gb, &sps->vui.common, avctx);

    if (vui->common.video_signal_type_present_flag) {
-        if (vui->common.video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
-            sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
        if (vui->common.colour_description_present_flag) {
            if (vui->common.matrix_coeffs == AVCOL_SPC_RGB) {
                switch (sps->pix_fmt) {
--- a/libavcodec/hevc_ps.h
+++ b/libavcodec/hevc_ps.h
@ -153,7 +153,7 @@ typedef struct PTL {

 typedef struct HEVCVPS {
    unsigned int vps_id;
-    HEVCHdrParams *hdr;
+    HEVCHdrParams hdr[HEVC_MAX_LAYER_SETS];

    uint8_t vps_temporal_id_nesting_flag;
    int vps_max_layers;
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@ -50,6 +50,8 @@ void ff_hevc_unref_frame(HEVCFrame *frame, int flags)
        frame->refPicList = NULL;

        ff_refstruct_unref(&frame->hwaccel_picture_private);
+
+        frame->missing = 0;
    }
 }

@ -440,6 +442,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
    frame->poc      = poc;
    frame->sequence = HEVC_SEQUENCE_COUNTER_INVALID;
    frame->flags    = 0;
+    frame->missing  = 1;

    if (s->threads_type == FF_THREAD_FRAME)
        ff_thread_report_progress(&frame->tf, INT_MAX, 0);
--- a/libavcodec/hevcdec.c
+++ b/libavcodec/hevcdec.c
@ -29,6 +29,7 @@
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/film_grain_params.h"
+#include "libavutil/hdr_dynamic_metadata.h"
 #include "libavutil/internal.h"
 #include "libavutil/md5.h"
 #include "libavutil/opt.h"
@ -2817,7 +2818,30 @@ static int set_side_data(HEVCContext *s)
    }

    if (s->sei.common.dynamic_hdr_plus.info) {
-        AVBufferRef *info_ref = av_buffer_ref(s->sei.common.dynamic_hdr_plus.info);
+        AVBufferRef *info_ref;
+        AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus*)s->sei.common.dynamic_hdr_plus.info->data;
+
+        // fill in window 0 (full frame) and convert to relative coordinates
+        if (metadata->params[0].window_lower_right_corner_x.num == 0)
+        {
+            // ensure the buffer is writable; if that fails the buffer is left
+            // untouched and may still be shared, so it must not be modified
+            int err = av_buffer_make_writable(&s->sei.common.dynamic_hdr_plus.info);
+            if (err < 0)
+                return err;
+            // the call above may have replaced the buffer, so re-read the data pointer
+            metadata = (AVDynamicHDRPlus*)s->sei.common.dynamic_hdr_plus.info->data;
+
+            // Convert coordinates to relative coordinate in [0, 1].
+            metadata->params[0].window_upper_left_corner_x.num  = 0;
+            metadata->params[0].window_upper_left_corner_y.num  = 0;
+            metadata->params[0].window_lower_right_corner_x.num = out->width - 1;
+            metadata->params[0].window_lower_right_corner_y.num = out->height - 1;
+            for (int w = 0; w < metadata->num_windows; w++) {
+                metadata->params[w].window_upper_left_corner_x.den = out->width - 1;
+                metadata->params[w].window_upper_left_corner_y.den = out->height - 1;
+                metadata->params[w].window_lower_right_corner_x.den = out->width - 1;
+                metadata->params[w].window_lower_right_corner_y.den = out->height - 1;
+            }
+        }
+
+        info_ref = av_buffer_ref(s->sei.common.dynamic_hdr_plus.info);
        if (!info_ref)
            return AVERROR(ENOMEM);

@ -3654,7 +3678,8 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
        if (avctx->extradata_size > 0 && avctx->extradata) {
            ret = hevc_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
            if (ret < 0) {
-                return ret;
+                s->is_nalff = 0;
+                av_log(avctx, AV_LOG_ERROR, "Invalid extradata ignored\n");
            }
        }

--- a/libavcodec/hevcdec.h
+++ b/libavcodec/hevcdec.h
@ -377,6 +377,11 @@ typedef struct HEVCFrame {
     * A combination of HEVC_FRAME_FLAG_*
     */
    uint8_t flags;
+
+    /**
+     * 1 - a dummy frame generated in place of a missing frame
+     */
+    int missing;
 } HEVCFrame;

 typedef struct HEVCLocalContext {
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@ -78,4 +78,7 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
 #if ARCH_MIPS
    ff_hevc_pred_init_mips(hpc, bit_depth);
 #endif
+#if ARCH_X86
+    ff_hevc_pred_init_x86(hpc, bit_depth);
+#endif
 }
--- a/libavcodec/hevcpred.h
+++ b/libavcodec/hevcpred.h
@ -42,5 +42,6 @@ typedef struct HEVCPredContext {

 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
 void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth);

 #endif /* AVCODEC_HEVCPRED_H */
--- a/libavcodec/hwconfig.h
+++ b/libavcodec/hwconfig.h
@ -64,7 +64,7 @@ void ff_hwaccel_uninit(AVCodecContext *avctx);
 #define HWACCEL_DXVA2(codec) \
    HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD,    DXVA2,        ff_ ## codec ## _dxva2_hwaccel)
 #define HWACCEL_D3D11VA2(codec) \
-    HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
+    HW_CONFIG_HWACCEL(1, 1, 1, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
 #define HWACCEL_NVDEC(codec) \
    HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
 #define HWACCEL_VAAPI(codec) \
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@ -84,6 +84,7 @@ typedef struct Mpeg1Context {
    unsigned aspect_ratio_info;
    AVRational save_aspect;
    int save_width, save_height, save_progressive_seq;
+    enum AVCodecID save_codec_id;
    AVRational frame_rate_ext;  /* MPEG-2 specific framerate modificator */
    unsigned frame_rate_index;
    int sync;                   /* Did we reach a sync point like a GOP/SEQ/KEYFrame? */
@ -795,9 +796,6 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
    Mpeg1Context *s    = avctx->priv_data;
    MpegEncContext *s2 = &s->mpeg_enc_ctx;

-    if (   avctx->codec_tag != AV_RL32("VCR2")
-        && avctx->codec_tag != AV_RL32("BW10"))
-        avctx->coded_width = avctx->coded_height = 0; // do not trust dimensions from input
    ff_mpv_decode_init(s2, avctx);

    ff_mpeg12_init_vlcs();
@ -968,6 +966,7 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
        s1->save_height          != s->height               ||
        av_cmp_q(s1->save_aspect, s->avctx->sample_aspect_ratio) ||
        (s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) ||
+        s1->save_codec_id        != s->codec_id             ||
        0) {
        if (s1->mpeg_enc_ctx_allocated) {
            ff_mpv_common_end(s);
@ -989,6 +988,7 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
        s1->save_width           = s->width;
        s1->save_height          = s->height;
        s1->save_progressive_seq = s->progressive_sequence;
+        s1->save_codec_id        = s->codec_id;

        /* low_delay may be forced, in this case we will have B-frames
         * that behave like P-frames. */
@ -1021,7 +1021,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
            case 1: avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; break;
            case 2:
            case 3: avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; break;
-            default: av_assert0(0);
            }
        } // MPEG-2

@ -1091,6 +1090,7 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
    skip_bits(&s->gb, 1); /* profile and level esc*/
    s->avctx->profile       = get_bits(&s->gb, 3);
    s->avctx->level         = get_bits(&s->gb, 4);
+    s->avctx->progressive_sequence =
    s->progressive_sequence = get_bits1(&s->gb);   /* progressive_sequence */
    s->chroma_format        = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */

@ -1841,6 +1841,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
    s->height = height;

    /* We set MPEG-2 parameters so that it emulates MPEG-1. */
+    s->avctx->progressive_sequence =
    s->progressive_sequence = 1;
    s->progressive_frame    = 1;
    s->picture_structure    = PICT_FRAME;
@ -1894,6 +1895,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
        s->chroma_inter_matrix[j] = v;
    }

+    s->avctx->progressive_sequence =
    s->progressive_sequence  = 1;
    s->progressive_frame     = 1;
    s->picture_structure     = PICT_FRAME;
@ -1908,6 +1910,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
    s1->save_width           = s->width;
    s1->save_height          = s->height;
    s1->save_progressive_seq = s->progressive_sequence;
+    s1->save_codec_id        = s->codec_id;
    return 0;
 }

--- a/libavcodec/mpeg4audio.h
+++ b/libavcodec/mpeg4audio.h
@ -39,6 +39,7 @@ typedef struct MPEG4AudioConfig {
    int channels;
    int ps;  ///< -1 implicit, 1 presence
    int frame_length_short;
+    int pce;
 } MPEG4AudioConfig;

 extern const int     ff_mpeg4audio_sample_rates[16];
--- a/libavcodec/mpegvideo_dec.c
+++ b/libavcodec/mpegvideo_dec.c
@ -344,8 +344,10 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
            pic->reference = 3;
    }

-    if (alloc_picture(s, pic) < 0)
+    if (alloc_picture(s, pic) < 0) {
+        s->current_picture_ptr = NULL;
        return -1;
+    }

    s->current_picture_ptr = pic;
    // FIXME use only the vars from current_pic
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@ -75,9 +75,15 @@ static int mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf,
                pc->frame_start_found = 4;
            }
            if (state == SEQ_END_CODE) {
+                int idx = i + 1;
+                /* DVDs won't send the next frame start on still images */
+                /* SEQ_END_CODE will have to stay at the beginning of the next frame */
+                if (pc->frame_start_found && i != 3) {
+                    idx = i - 3;
+                }
                pc->frame_start_found = 0;
                pc->state = -1;
-                return i + 1;
+                return idx;
            }
            if (pc->frame_start_found == 2 && state == SEQ_START_CODE)
                pc->frame_start_found = 0;
--- a/libavcodec/packet.h
+++ b/libavcodec/packet.h
@ -68,6 +68,9 @@ enum AVPacketSideDataType {
     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS)
     *     s32le width
     *     s32le height
+     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_ASPECTRATIO)
+     *     s32le num
+     *     s32le den
     * @endcode
     */
    AV_PKT_DATA_PARAM_CHANGE,
@ -596,8 +599,11 @@ typedef struct AVPacketList {
 #define AV_PKT_FLAG_DISPOSABLE 0x0010

 enum AVSideDataParamChangeFlags {
+    AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT  = 0x0001,
+    AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT = 0x0002,
    AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE    = 0x0004,
    AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS     = 0x0008,
+    AV_SIDE_DATA_PARAM_CHANGE_ASPECTRATIO    = 0x8000,
 };

 /**
--- a/libavcodec/parsers.c
+++ b/libavcodec/parsers.c
@ -51,6 +51,7 @@ extern const AVCodecParser ff_gsm_parser;
 extern const AVCodecParser ff_h261_parser;
 extern const AVCodecParser ff_h263_parser;
 extern const AVCodecParser ff_h264_parser;
+extern const AVCodecParser ff_h264_mvc_parser;
 extern const AVCodecParser ff_hevc_parser;
 extern const AVCodecParser ff_hdr_parser;
 extern const AVCodecParser ff_ipu_parser;
--- a/libavcodec/profiles.c
+++ b/libavcodec/profiles.c
@ -40,6 +40,8 @@ const AVProfile ff_dca_profiles[] = {
    { AV_PROFILE_DTS_ES,             "DTS-ES"                 },
    { AV_PROFILE_DTS_96_24,          "DTS 96/24"              },
    { AV_PROFILE_DTS_HD_HRA,         "DTS-HD HRA"             },
+    { AV_PROFILE_DTS_HD_HRA_X,       "DTS-HD HRA + DTS:X"     },
+    { AV_PROFILE_DTS_HD_HRA_X_IMAX,  "DTS-HD HRA + DTS:X IMAX"},
    { AV_PROFILE_DTS_HD_MA,          "DTS-HD MA"              },
    { AV_PROFILE_DTS_HD_MA_X,        "DTS-HD MA + DTS:X"      },
    { AV_PROFILE_DTS_HD_MA_X_IMAX,   "DTS-HD MA + DTS:X IMAX" },
@ -83,6 +85,7 @@ const AVProfile ff_h264_profiles[] = {
    { AV_PROFILE_H264_CAVLC_444,            "CAVLC 4:4:4"           },
    { AV_PROFILE_H264_MULTIVIEW_HIGH,       "Multiview High"        },
    { AV_PROFILE_H264_STEREO_HIGH,          "Stereo High"           },
+    { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth"  },
    { AV_PROFILE_UNKNOWN },
 };

--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@ -315,6 +315,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
        if (err < 0)
            return err;

+        dst->progressive_sequence = src->progressive_sequence;
+
        if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx ||
            (dst->hw_frames_ctx && dst->hw_frames_ctx->data != src->hw_frames_ctx->data)) {
            av_buffer_unref(&dst->hw_frames_ctx);
--- a/libavcodec/s302m.c
+++ b/libavcodec/s302m.c
@ -203,6 +203,7 @@ static int s302m_decode_frame(AVCodecContext *avctx, AVFrame *frame,
    }

    avctx->sample_rate = 48000;
+    avctx->codec_tag = non_pcm_data_type;

    *got_frame_ptr = 1;

@ -211,7 +212,7 @@ static int s302m_decode_frame(AVCodecContext *avctx, AVFrame *frame,

 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_DECODING_PARAM
 static const AVOption s302m_options[] = {
-    {"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
+    {"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 2}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
    {"copy"        , "Pass NON-PCM through unchanged"     , 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
    {"drop"        , "Drop NON-PCM"                       , 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
    {"decode_copy" , "Decode if possible else passthrough", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@ -27,7 +27,7 @@
 #include <inttypes.h>
 #include <stdlib.h>

-#define CACHED_BITSTREAM_READER !ARCH_X86_32
+#define CACHED_BITSTREAM_READER 0 /* cached reader is broken with get_bits_le used below */
 #define UNCHECKED_BITSTREAM_READER 1

 #include "libavutil/intreadwrite.h"
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@ -396,6 +396,8 @@ typedef struct VC1Context{

    int parse_only;              ///< Context is used within parser
    int resync_marker;           ///< could this stream contain resync markers
+
+    int recovered;               ///< set by the first I-frame after a flush; until then non-I frames are rejected unless AV_CODEC_FLAG2_SHOW_ALL is set
 } VC1Context;

 /**
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@ -1045,6 +1045,13 @@ static int vc1_decode_frame(AVCodecContext *avctx, AVFrame *pict,
        goto err;
    }

+    if (!v->recovered && !(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL)) {
+        if (s->pict_type == AV_PICTURE_TYPE_I)
+            v->recovered = 1;
+        else
+            goto err;
+    }
+
    /* skip B-frames if we don't have reference frames */
    if (!s->last_picture_ptr && s->pict_type == AV_PICTURE_TYPE_B) {
        av_log(v->s.avctx, AV_LOG_DEBUG, "Skipping B frame without reference frames\n");
@ -1381,6 +1388,14 @@ err:
    return ret;
 }

+/* Flush callback: run the generic ff_mpeg_flush(), then require a fresh I-frame. */
+static void vc1_decode_flush(AVCodecContext *avctx)
+{
+    VC1Context *const vc1 = avctx->priv_data;
+
+    ff_mpeg_flush(avctx);
+    vc1->recovered = 0; /* vc1_decode_frame rejects non-I frames until this is set again */
+}

 const FFCodec ff_vc1_decoder = {
    .p.name         = "vc1",
@ -1391,7 +1406,7 @@ const FFCodec ff_vc1_decoder = {
    .init           = vc1_decode_init,
    .close          = ff_vc1_decode_end,
    FF_CODEC_DECODE_CB(vc1_decode_frame),
-    .flush          = ff_mpeg_flush,
+    .flush          = vc1_decode_flush,
    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
 #if CONFIG_VC1_DXVA2_HWACCEL
@ -1430,7 +1445,7 @@ const FFCodec ff_wmv3_decoder = {
    .init           = vc1_decode_init,
    .close          = ff_vc1_decode_end,
    FF_CODEC_DECODE_CB(vc1_decode_frame),
-    .flush          = ff_mpeg_flush,
+    .flush          = vc1_decode_flush,
    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
 #if CONFIG_WMV3_DXVA2_HWACCEL
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@ -1341,9 +1341,6 @@ static int decode_tiles(AVCodecContext *avctx,
                        decode_sb_mem(td, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                    } else {
-                        if (vpx_rac_is_end(td->c)) {
-                            return AVERROR_INVALIDDATA;
-                        }
                        decode_sb(td, row, col, lflvl_ptr,
                                  yoff2, uvoff2, BL_64X64);
                    }
--- a/libavcodec/vvc/vvcdec.c
+++ b/libavcodec/vvc/vvcdec.c
@ -1040,8 +1040,7 @@ const FFCodec ff_vvc_decoder = {
    .close          = vvc_decode_free,
    FF_CODEC_DECODE_CB(vvc_decode_frame),
    .flush          = vvc_decode_flush,
-    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_OTHER_THREADS |
-                      AV_CODEC_CAP_EXPERIMENTAL,
+    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_OTHER_THREADS,
    .caps_internal  = FF_CODEC_CAP_EXPORTS_CROPPING | FF_CODEC_CAP_INIT_CLEANUP |
                      FF_CODEC_CAP_AUTO_THREADS,
    .p.profiles     = NULL_IF_CONFIG_SMALL(ff_vvc_profiles),
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@ -169,7 +169,9 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += x86/hevc_add_res.o            \
                                          x86/hevc_mc.o                 \
                                          x86/h26x/h2656_inter.o        \
                                          x86/hevc_sao.o                \
-                                          x86/hevc_sao_10bit.o
+                                          x86/hevc_sao_10bit.o          \
+                                          x86/hevc_idct_intrinsic.o     \
+                                          x86/hevc_intra_intrinsic.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
 X86ASM-OBJS-$(CONFIG_LSCR_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_MLP_DECODER)      += x86/mlpdsp.o
--- a/libavcodec/x86/hevc_idct_intrinsic.c
+++ b/libavcodec/x86/hevc_idct_intrinsic.c
@ -0,0 +1,716 @@
+#include "config.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavcodec/hevc.h"
+#include "libavcodec/x86/hevcdsp.h"
+
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#endif
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+DECLARE_ALIGNED(16, static const int16_t, transform4x4_luma[8][8] )=
+{   /* each row: one 16-bit coefficient pair repeated 4x (8 x int16 = 128 bits); NOTE(review): 29/55/74/84 look like the 4x4 luma DST basis - confirm */
+    {   29, +84, 29,  +84,  29, +84,  29, +84 },
+    {  +74, +55, +74, +55, +74, +55, +74, +55 },
+    {   55, -29,  55, -29,  55, -29,  55, -29 },
+    {  +74, -84, +74, -84, +74, -84, +74, -84 },
+    {   74, -74,  74, -74,  74, -74,  74, -74 },
+    {    0, +74,   0, +74,   0, +74,   0, +74 },
+    {   84, +55,  84, +55,  84, +55,  84, +55 },
+    {  -74, -29, -74, -29, -74, -29, -74, -29 }
+};
+
+DECLARE_ALIGNED( 16, static const int16_t, transform4x4[4][8] ) = { /* each row: one coefficient pair repeated 4x; NOTE(review): presumably 4-point inverse DCT pairs - confirm */
+    { 64,  64, 64,  64, 64,  64, 64,  64 },
+    { 64, -64, 64, -64, 64, -64, 64, -64 },
+    { 83,  36, 83,  36, 83,  36, 83,  36 },
+    { 36, -83, 36, -83, 36, -83, 36, -83 }
+};
+
+DECLARE_ALIGNED(16, static const int16_t, transform8x8[12][1][8] )=
+{
+    {{  89,  75,  89,  75, 89,  75, 89,  75 }},
+    {{  50,  18,  50,  18, 50,  18, 50,  18 }},
+    {{  75, -18,  75, -18, 75, -18, 75, -18 }},
+    {{ -89, -50, -89, -50,-89, -50,-89, -50 }},
+    {{  50, -89,  50, -89, 50, -89, 50, -89 }},
+    {{  18,  75,  18,  75, 18,  75, 18,  75 }},
+    {{  18, -50,  18, -50, 18, -50, 18, -50 }},
+    {{  75, -89,  75, -89, 75, -89, 75, -89 }},
+    {{  64,  64,  64,  64, 64,  64, 64,  64 }},
+    {{  64, -64,  64, -64, 64, -64, 64, -64 }},
+    {{  83,  36,  83,  36, 83,  36, 83,  36 }},
+    {{  36, -83,  36, -83, 36, -83, 36, -83 }}
+};
+
+DECLARE_ALIGNED(16, static const int16_t, transform16x16_1[4][8][8] )=
+{
+    {/*1-3*/ /*2-6*/
+        { 90,  87,  90,  87,  90,  87,  90,  87 },
+        { 87,  57,  87,  57,  87,  57,  87,  57 },
+        { 80,   9,  80,   9,  80,   9,  80,   9 },
+        { 70, -43,  70, -43,  70, -43,  70, -43 },
+        { 57, -80,  57, -80,  57, -80,  57, -80 },
+        { 43, -90,  43, -90,  43, -90,  43, -90 },
+        { 25, -70,  25, -70,  25, -70,  25, -70 },
+        { 9,  -25,   9, -25,   9, -25,   9, -25 },
+    },{ /*5-7*/ /*10-14*/
+        {  80,  70,  80,  70,  80,  70,  80,  70 },
+        {   9, -43,   9, -43,   9, -43,   9, -43 },
+        { -70, -87, -70, -87, -70, -87, -70, -87 },
+        { -87,   9, -87,   9, -87,   9, -87,   9 },
+        { -25,  90, -25,  90, -25,  90, -25,  90 },
+        {  57,  25,  57,  25,  57,  25,  57,  25 },
+        {  90, -80,  90, -80,  90, -80,  90, -80 },
+        {  43, -57,  43, -57,  43, -57,  43, -57 },
+    },{ /*9-11*/ /*18-22*/
+        {  57,  43,  57,  43,  57,  43,  57,  43 },
+        { -80, -90, -80, -90, -80, -90, -80, -90 },
+        { -25,  57, -25,  57, -25,  57, -25,  57 },
+        {  90,  25,  90,  25,  90,  25,  90,  25 },
+        {  -9,  -87, -9,  -87, -9,  -87, -9, -87 },
+        { -87,  70, -87,  70, -87,  70, -87,  70 },
+        {  43,   9,  43,   9,  43,   9,  43,   9 },
+        {  70, -80,  70, -80,  70, -80,  70, -80 },
+    },{/*13-15*/ /*  26-30   */
+        {  25,   9,  25,   9,  25,   9,  25,   9 },
+        { -70, -25, -70, -25, -70, -25, -70, -25 },
+        {  90,  43,  90,  43,  90,  43,  90,  43 },
+        { -80, -57, -80, -57, -80, -57, -80, -57 },
+        {  43,  70,  43,  70,  43,  70,  43,  70 },
+        {  9,  -80,   9, -80,   9, -80,   9, -80 },
+        { -57,  87, -57,  87, -57,  87, -57,  87 },
+        {  87, -90,  87, -90,  87, -90,  87, -90 },
+    }
+};
+
+DECLARE_ALIGNED(16, static const int16_t, transform32x32[8][16][8] )=
+{
+    { /*   1-3     */
+        { 90,  90, 90,  90, 90,  90, 90,  90 },
+        { 90,  82, 90,  82, 90,  82, 90,  82 },
+        { 88,  67, 88,  67, 88,  67, 88,  67 },
+        { 85,  46, 85,  46, 85,  46, 85,  46 },
+        { 82,  22, 82,  22, 82,  22, 82,  22 },
+        { 78,  -4, 78,  -4, 78,  -4, 78,  -4 },
+        { 73, -31, 73, -31, 73, -31, 73, -31 },
+        { 67, -54, 67, -54, 67, -54, 67, -54 },
+        { 61, -73, 61, -73, 61, -73, 61, -73 },
+        { 54, -85, 54, -85, 54, -85, 54, -85 },
+        { 46, -90, 46, -90, 46, -90, 46, -90 },
+        { 38, -88, 38, -88, 38, -88, 38, -88 },
+        { 31, -78, 31, -78, 31, -78, 31, -78 },
+        { 22, -61, 22, -61, 22, -61, 22, -61 },
+        { 13, -38, 13, -38, 13, -38, 13, -38 },
+        { 4,  -13,  4, -13,  4, -13,  4, -13 },
+    },{/*  5-7 */
+        {  88,  85,  88,  85,  88,  85,  88,  85 },
+        {  67,  46,  67,  46,  67,  46,  67,  46 },
+        {  31, -13,  31, -13,  31, -13,  31, -13 },
+        { -13, -67, -13, -67, -13, -67, -13, -67 },
+        { -54, -90, -54, -90, -54, -90, -54, -90 },
+        { -82, -73, -82, -73, -82, -73, -82, -73 },
+        { -90, -22, -90, -22, -90, -22, -90, -22 },
+        { -78,  38, -78,  38, -78,  38, -78,  38 },
+        { -46,  82, -46,  82, -46,  82, -46,  82 },
+        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
+        {  38,  54,  38,  54,  38,  54,  38,  54 },
+        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
+        {  90, -61,  90, -61,  90, -61,  90, -61 },
+        {  85, -90,  85, -90,  85, -90,  85, -90 },
+        {  61, -78,  61, -78,  61, -78,  61, -78 },
+        {  22, -31,  22, -31,  22, -31,  22, -31 },
+    },{/*  9-11   */
+        {  82,  78,  82,  78,  82,  78,  82,  78 },
+        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
+        { -54, -82, -54, -82, -54, -82, -54, -82 },
+        { -90, -73, -90, -73, -90, -73, -90, -73 },
+        { -61,  13, -61,  13, -61,  13, -61,  13 },
+        {  13,  85,  13,  85,  13,  85,  13,  85 },
+        {  78,  67,  78,  67,  78,  67,  78,  67 },
+        {  85, -22,  85, -22,  85, -22,  85, -22 },
+        {  31, -88,  31, -88,  31, -88,  31, -88 },
+        { -46, -61, -46, -61, -46, -61, -46, -61 },
+        { -90,  31, -90,  31, -90,  31, -90,  31 },
+        { -67,  90, -67,  90, -67,  90, -67,  90 },
+        {   4,  54,   4,  54,   4,  54,   4,  54 },
+        {  73, -38,  73, -38,  73, -38,  73, -38 },
+        {  88, -90,  88, -90,  88, -90,  88, -90 },
+        {  38, -46,  38, -46,  38, -46,  38, -46 },
+    },{/*  13-15   */
+        {  73,  67,  73,  67,  73,  67,  73,  67 },
+        { -31, -54, -31, -54, -31, -54, -31, -54 },
+        { -90, -78, -90, -78, -90, -78, -90, -78 },
+        { -22,  38, -22,  38, -22,  38, -22,  38 },
+        {  78,  85,  78,  85,  78,  85,  78,  85 },
+        {  67, -22,  67, -22,  67, -22,  67, -22 },
+        { -38, -90, -38, -90, -38, -90, -38, -90 },
+        { -90,   4, -90,   4, -90,   4, -90,   4 },
+        { -13,  90, -13,  90, -13,  90, -13,  90 },
+        {  82,  13,  82,  13,  82,  13,  82,  13 },
+        {  61, -88,  61, -88,  61, -88,  61, -88 },
+        { -46, -31, -46, -31, -46, -31, -46, -31 },
+        { -88,  82, -88,  82, -88,  82, -88,  82 },
+        { -4,   46, -4,   46, -4,   46, -4,   46 },
+        {  85, -73,  85, -73,  85, -73,  85, -73 },
+        {  54, -61,  54, -61,  54, -61,  54, -61 },
+    },{/*  17-19   */
+        {  61,  54,  61,  54,  61,  54,  61,  54 },
+        { -73, -85, -73, -85, -73, -85, -73, -85 },
+        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
+        {  82,  88,  82,  88,  82,  88,  82,  88 },
+        {  31, -46,  31, -46,  31, -46,  31, -46 },
+        { -88, -61, -88, -61, -88, -61, -88, -61 },
+        { -13,  82, -13,  82, -13,  82, -13,  82 },
+        {  90,  13,  90,  13,  90,  13,  90,  13 },
+        { -4, -90,  -4, -90,  -4, -90,  -4, -90 },
+        { -90,  38, -90,  38, -90,  38, -90,  38 },
+        {  22,  67,  22,  67,  22,  67,  22,  67 },
+        {  85, -78,  85, -78,  85, -78,  85, -78 },
+        { -38, -22, -38, -22, -38, -22, -38, -22 },
+        { -78,  90, -78,  90, -78,  90, -78,  90 },
+        {  54, -31,  54, -31,  54, -31,  54, -31 },
+        {  67, -73,  67, -73,  67, -73,  67, -73 },
+    },{ /*  21-23   */
+        {  46,  38,  46,  38,  46,  38,  46,  38 },
+        { -90, -88, -90, -88, -90, -88, -90, -88 },
+        {  38,  73,  38,  73,  38,  73,  38,  73 },
+        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
+        { -90, -67, -90, -67, -90, -67, -90, -67 },
+        {  31,  90,  31,  90,  31,  90,  31,  90 },
+        {  61, -46,  61, -46,  61, -46,  61, -46 },
+        { -88, -31, -88, -31, -88, -31, -88, -31 },
+        {  22,  85,  22,  85,  22,  85,  22,  85 },
+        {  67, -78,  67, -78,  67, -78,  67, -78 },
+        { -85,  13, -85,  13, -85,  13, -85,  13 },
+        {  13,  61,  13,  61,  13,  61,  13,  61 },
+        {  73, -90,  73, -90,  73, -90,  73, -90 },
+        { -82,  54, -82,  54, -82,  54, -82,  54 },
+        {   4,  22,   4,  22,   4,  22,   4,  22 },
+        {  78, -82,  78, -82,  78, -82,  78, -82 },
+    },{ /*  25-27   */
+        {  31,  22,  31,  22,  31,  22,  31,  22 },
+        { -78, -61, -78, -61, -78, -61, -78, -61 },
+        {  90,  85,  90,  85,  90,  85,  90,  85 },
+        { -61, -90, -61, -90, -61, -90, -61, -90 },
+        {   4,  73,   4,  73,   4,  73,   4,  73 },
+        {  54, -38,  54, -38,  54, -38,  54, -38 },
+        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
+        {  82,  46,  82,  46,  82,  46,  82,  46 },
+        { -38, -78, -38, -78, -38, -78, -38, -78 },
+        { -22,  90, -22,  90, -22,  90, -22,  90 },
+        {  73, -82,  73, -82,  73, -82,  73, -82 },
+        { -90,  54, -90,  54, -90,  54, -90,  54 },
+        {  67, -13,  67, -13,  67, -13,  67, -13 },
+        { -13, -31, -13, -31, -13, -31, -13, -31 },
+        { -46,  67, -46,  67, -46,  67, -46,  67 },
+        {  85, -88,  85, -88,  85, -88,  85, -88 },
+    },{/*  29-31   */
+        {  13,   4,  13,   4,  13,   4,  13,   4 },
+        { -38, -13, -38, -13, -38, -13, -38, -13 },
+        {  61,  22,  61,  22,  61,  22,  61,  22 },
+        { -78, -31, -78, -31, -78, -31, -78, -31 },
+        {  88,  38,  88,  38,  88,  38,  88,  38 },
+        { -90, -46, -90, -46, -90, -46, -90, -46 },
+        {  85,  54,  85,  54,  85,  54,  85,  54 },
+        { -73, -61, -73, -61, -73, -61, -73, -61 },
+        {  54,  67,  54,  67,  54,  67,  54,  67 },
+        { -31, -73, -31, -73, -31, -73, -31, -73 },
+        {   4,  78,   4,  78,   4,  78,   4,  78 },
+        {  22, -82,  22, -82,  22, -82,  22, -82 },
+        { -46,  85, -46,  85, -46,  85, -46,  85 },
+        {  67, -88,  67, -88,  67, -88,  67, -88 },
+        { -82,  90, -82,  90, -82,  90, -82,  90 },
+        {  90, -90,  90, -90,  90, -90,  90, -90 },
+    }
+};
+
+#define shift_1st 7                     /* NOTE(review): presumably the right-shift of the first transform pass - confirm */
+#define add_1st (1 << (shift_1st - 1))  /* half of (1 << shift_1st), i.e. 64 */
+
+#define CLIP_PIXEL_MAX_10 0x03FF        /* (1 << 10) - 1 */
+#define CLIP_PIXEL_MAX_12 0x0FFF        /* (1 << 12) - 1 */
+
+#if HAVE_SSE2
+////////////////////////////////////////////////////////////////////////////////
+// Destination pointer / stride setup for 8-bit and 10/12-bit output
+////////////////////////////////////////////////////////////////////////////////
+#define INIT_8()                                                               \
+    uint8_t *dst = (uint8_t*) _dst;                                            \
+    ptrdiff_t stride = _stride
+#define INIT_10()                                                              \
+    uint16_t *dst = (uint16_t*) _dst;                                          \
+    ptrdiff_t stride = _stride>>1
+
+#define INIT_12() INIT_10()
+#define INIT8_12() INIT8_10()
+
+////////////////////////////////////////////////////////////////////////////////
+// Load helpers: fetch source rows into __m128i variables
+////////////////////////////////////////////////////////////////////////////////
+#define LOAD_EMPTY(dst, src)
+#define LOAD4x4(dst, src)                                                      \
+    dst ## 0 = _mm_load_si128((__m128i *) &src[0]);                           \
+    dst ## 1 = _mm_load_si128((__m128i *) &src[8])
+#define LOAD4x4_STEP(dst, src, sstep)                                          \
+    tmp0 = _mm_loadl_epi64((__m128i *) &src[0 * sstep]);                       \
+    tmp1 = _mm_loadl_epi64((__m128i *) &src[1 * sstep]);                       \
+    tmp2 = _mm_loadl_epi64((__m128i *) &src[2 * sstep]);                       \
+    tmp3 = _mm_loadl_epi64((__m128i *) &src[3 * sstep]);                       \
+    dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp2);                                 \
+    dst ## 1 = _mm_unpacklo_epi16(tmp1, tmp3)
+#define LOAD8x8_E(dst, src, sstep)                                             \
+    dst ## 0 = _mm_load_si128((__m128i *) &src[0 * sstep]);                   \
+    dst ## 1 = _mm_load_si128((__m128i *) &src[1 * sstep]);                   \
+    dst ## 2 = _mm_load_si128((__m128i *) &src[2 * sstep]);                   \
+    dst ## 3 = _mm_load_si128((__m128i *) &src[3 * sstep])
+#define LOAD8x8_O(dst, src, sstep)                                             \
+    tmp0 = _mm_load_si128((__m128i *) &src[1 * sstep]);                       \
+    tmp1 = _mm_load_si128((__m128i *) &src[3 * sstep]);                       \
+    tmp2 = _mm_load_si128((__m128i *) &src[5 * sstep]);                       \
+    tmp3 = _mm_load_si128((__m128i *) &src[7 * sstep]);                       \
+    dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1);                                 \
+    dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1);                                 \
+    dst ## 2 = _mm_unpacklo_epi16(tmp2, tmp3);                                 \
+    dst ## 3 = _mm_unpackhi_epi16(tmp2, tmp3)
+#define LOAD16x16_O(dst, src, sstep)                                           \
+    LOAD8x8_O(dst, src, sstep);                                                \
+    tmp0 = _mm_load_si128((__m128i *) &src[ 9 * sstep]);                      \
+    tmp1 = _mm_load_si128((__m128i *) &src[11 * sstep]);                      \
+    tmp2 = _mm_load_si128((__m128i *) &src[13 * sstep]);                      \
+    tmp3 = _mm_load_si128((__m128i *) &src[15 * sstep]);                      \
+    dst ## 4 = _mm_unpacklo_epi16(tmp0, tmp1);                                 \
+    dst ## 5 = _mm_unpackhi_epi16(tmp0, tmp1);                                 \
+    dst ## 6 = _mm_unpacklo_epi16(tmp2, tmp3);                                 \
+    dst ## 7 = _mm_unpackhi_epi16(tmp2, tmp3)
+
+#define LOAD_8x32(dst, dst_stride, src0, src1, idx)                            \
+    src0 = _mm_load_si128((__m128i *) &dst[idx*dst_stride]);                   \
+    src1 = _mm_load_si128((__m128i *) &dst[idx*dst_stride+4])
+
+////////////////////////////////////////////////////////////////////////////////
+// Store helpers and ASSIGN* wrappers that apply an assign macro per register
+////////////////////////////////////////////////////////////////////////////////
+#define ASSIGN_EMPTY(dst, dst_stride, src)
+#define SAVE_8x16(dst, dst_stride, src)                                        \
+    _mm_store_si128((__m128i *) dst, src);                                    \
+    dst += dst_stride
+#define SAVE_8x32(dst, dst_stride, src0, src1, idx)                            \
+    _mm_store_si128((__m128i *) &dst[idx*dst_stride]  , src0);                \
+    _mm_store_si128((__m128i *) &dst[idx*dst_stride+4], src1)
+
+#define ASSIGN2(dst, dst_stride, src0, src1, assign)                           \
+    assign(dst, dst_stride, src0);                                             \
+    assign(dst, dst_stride, _mm_srli_si128(src0, 8));                          \
+    assign(dst, dst_stride, src1);                                             \
+    assign(dst, dst_stride, _mm_srli_si128(src1, 8))
+#define ASSIGN4(dst, dst_stride, src0, src1, src2, src3, assign)               \
+    assign(dst, dst_stride, src0);                                             \
+    assign(dst, dst_stride, src1);                                             \
+    assign(dst, dst_stride, src2);                                             \
+    assign(dst, dst_stride, src3)
+#define ASSIGN4_LO(dst, dst_stride, src, assign)                               \
+    ASSIGN4(dst, dst_stride, src ## 0, src ## 1, src ## 2, src ## 3, assign)
+#define ASSIGN4_HI(dst, dst_stride, src, assign)                               \
+    ASSIGN4(dst, dst_stride, src ## 4, src ## 5, src ## 6, src ## 7, assign)
+
+////////////////////////////////////////////////////////////////////////////////
+// In-register transposes of 16-bit elements
+////////////////////////////////////////////////////////////////////////////////
+#define TRANSPOSE4X4_16(dst)                                                   \
+    tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1);                             \
+    tmp1 = _mm_unpackhi_epi16(dst ## 0, dst ## 1);                             \
+    dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1);                                 \
+    dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1)
+#define TRANSPOSE4X4_16_S(dst, dst_stride, src, assign)                        \
+    TRANSPOSE4X4_16(src);                                                      \
+    ASSIGN2(dst, dst_stride, src ## 0, src ## 1, assign)
+
+/*
+ * TRANSPOSE8X8_16(dst): in-place transpose of an 8x8 block of 16-bit values
+ * held one row per register in <dst>0 .. <dst>7. It interleaves at 16-bit,
+ * then 32-bit, then 64-bit granularity; the low halves produce output rows
+ * 0-3 and the high halves rows 4-7.
+ * Clobbers tmp0..tmp3 and src0..src3 from the enclosing scope.
+ *
+ * TRANSPOSE8x8_16_S: transposes <src>0..<src>7, sets p_dst (enclosing
+ * scope) to `out`, then emits all eight rows through `assign`.
+ *
+ * TRANSPOSE8x8_16_LS: loads eight rows from `in` (row pitch sstep_in
+ * elements) into e0..e7 and then behaves like TRANSPOSE8x8_16_S. The loads
+ * are aligned loads, so every row address must be 16-byte aligned.
+ */
+#define TRANSPOSE8X8_16(dst)                                                   \
+    tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1);                             \
+    tmp1 = _mm_unpacklo_epi16(dst ## 2, dst ## 3);                             \
+    tmp2 = _mm_unpacklo_epi16(dst ## 4, dst ## 5);                             \
+    tmp3 = _mm_unpacklo_epi16(dst ## 6, dst ## 7);                             \
+    src0 = _mm_unpacklo_epi32(tmp0, tmp1);                                     \
+    src1 = _mm_unpacklo_epi32(tmp2, tmp3);                                     \
+    src2 = _mm_unpackhi_epi32(tmp0, tmp1);                                     \
+    src3 = _mm_unpackhi_epi32(tmp2, tmp3);                                     \
+    tmp0 = _mm_unpackhi_epi16(dst ## 0, dst ## 1);                             \
+    tmp1 = _mm_unpackhi_epi16(dst ## 2, dst ## 3);                             \
+    tmp2 = _mm_unpackhi_epi16(dst ## 4, dst ## 5);                             \
+    tmp3 = _mm_unpackhi_epi16(dst ## 6, dst ## 7);                             \
+    dst ## 0 = _mm_unpacklo_epi64(src0 , src1);                                \
+    dst ## 1 = _mm_unpackhi_epi64(src0 , src1);                                \
+    dst ## 2 = _mm_unpacklo_epi64(src2 , src3);                                \
+    dst ## 3 = _mm_unpackhi_epi64(src2 , src3);                                \
+    src0 = _mm_unpacklo_epi32(tmp0, tmp1);                                     \
+    src1 = _mm_unpacklo_epi32(tmp2, tmp3);                                     \
+    src2 = _mm_unpackhi_epi32(tmp0, tmp1);                                     \
+    src3 = _mm_unpackhi_epi32(tmp2, tmp3);                                     \
+    dst ## 4 = _mm_unpacklo_epi64(src0 , src1);                                \
+    dst ## 5 = _mm_unpackhi_epi64(src0 , src1);                                \
+    dst ## 6 = _mm_unpacklo_epi64(src2 , src3);                                \
+    dst ## 7 = _mm_unpackhi_epi64(src2 , src3)
+#define TRANSPOSE8x8_16_S(out, sstep_out, src, assign)                         \
+    TRANSPOSE8X8_16(src);                                                      \
+    p_dst = out;                                                               \
+    ASSIGN4_LO(p_dst, sstep_out, src, assign);                                 \
+    ASSIGN4_HI(p_dst, sstep_out, src, assign)
+#define TRANSPOSE8x8_16_LS(out, sstep_out, in, sstep_in, assign)               \
+    e0  = _mm_load_si128((__m128i *) &in[0*sstep_in]);                         \
+    e1  = _mm_load_si128((__m128i *) &in[1*sstep_in]);                         \
+    e2  = _mm_load_si128((__m128i *) &in[2*sstep_in]);                         \
+    e3  = _mm_load_si128((__m128i *) &in[3*sstep_in]);                         \
+    e4  = _mm_load_si128((__m128i *) &in[4*sstep_in]);                         \
+    e5  = _mm_load_si128((__m128i *) &in[5*sstep_in]);                         \
+    e6  = _mm_load_si128((__m128i *) &in[6*sstep_in]);                         \
+    e7  = _mm_load_si128((__m128i *) &in[7*sstep_in]);                         \
+    TRANSPOSE8x8_16_S(out, sstep_out, e, assign)
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * TR_COMPUTE_TRANFORM: multiply-accumulate step shared by the 8/16/32-point
+ * transforms. It loads two coefficient vectors, transform[i][j] and
+ * transform[i+1][j] (aligned loads), and produces
+ *   dst1 = madd(src0, transform[i][j]) + madd(src2, transform[i+1][j])
+ *   dst2 = madd(src1, transform[i][j]) + madd(src3, transform[i+1][j])
+ * as 32-bit lanes. Clobbers tmp0..tmp3 from the enclosing scope.
+ * The spelling "TRANFORM" is part of the macro name used by its callers.
+ */
+#define TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform)\
+    tmp1 = _mm_load_si128((__m128i *) transform[i  ][j]);                      \
+    tmp3 = _mm_load_si128((__m128i *) transform[i+1][j]);                      \
+    tmp0 = _mm_madd_epi16(src0, tmp1);                                         \
+    tmp1 = _mm_madd_epi16(src1, tmp1);                                         \
+    tmp2 = _mm_madd_epi16(src2, tmp3);                                         \
+    tmp3 = _mm_madd_epi16(src3, tmp3);                                         \
+    dst1 = _mm_add_epi32(tmp0, tmp2);                                          \
+    dst2 = _mm_add_epi32(tmp1, tmp3)
+
+/*
+ * SCALE8x8_2x32(dst0, src0, src1): arithmetic right shift of the 32-bit
+ * lanes of src0 and src1 by `shift` (variable from the enclosing scope),
+ * then pack both into dst0 as eight 16-bit values with signed saturation.
+ * src0 and src1 are modified in place. No rounding offset is added here.
+ *
+ * SCALE_4x32: applies SCALE8x8_2x32 to two register pairs, producing dst0
+ * and dst1.
+ */
+#define SCALE8x8_2x32(dst0, src0, src1)                                        \
+    src0 = _mm_srai_epi32(src0, shift);                                        \
+    src1 = _mm_srai_epi32(src1, shift);                                        \
+    dst0 = _mm_packs_epi32(src0, src1)
+#define SCALE_4x32(dst0, dst1, src0, src1, src2, src3)                         \
+    SCALE8x8_2x32(dst0, src0, src1);                                           \
+    SCALE8x8_2x32(dst1, src2, src3)
+/*
+ * SCALE16x16_2x32(dst, dst_stride, src0, src1, j): combines the pair
+ * (src0, src1) with the two vectors stored at o16[j*8] and o16[j*8+4]
+ * (o16 comes from the enclosing scope and is read with aligned loads).
+ * The sums are scaled and stored to row j of dst; the differences are
+ * scaled and stored to row (dst_stride-1-j). Note that the mirrored row
+ * index is derived from dst_stride.
+ * Clobbers e0, e7, tmp4, src0 and src1; stores are aligned stores.
+ */
+#define SCALE16x16_2x32(dst, dst_stride, src0, src1, j)                        \
+    e0   = _mm_load_si128((__m128i *) &o16[j*8+0]);                           \
+    e7   = _mm_load_si128((__m128i *) &o16[j*8+4]);                           \
+    tmp4 = _mm_add_epi32(src0, e0);                                            \
+    src0 = _mm_sub_epi32(src0, e0);                                            \
+    e0   = _mm_add_epi32(src1, e7);                                            \
+    src1 = _mm_sub_epi32(src1, e7);                                            \
+    SCALE_4x32(e0, e7, tmp4, e0, src0, src1);                                  \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(             j)]  , e0);     \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)]  , e7)
+
+/*
+ * SCALE32x32_2x32(dst, dst_stride, j): final butterfly of the 32-point
+ * transform for row pair j. It loads two vectors each from e32[j*16] and
+ * o32[j*16] (both arrays come from the enclosing scope, aligned loads),
+ * forms their sums and differences, scales them with SCALE_4x32, and stores
+ * the sums to row j and the differences to row (dst_stride-1-j) of dst.
+ * Clobbers e0, e1, e4, e5 and tmp0..tmp3.
+ *
+ * The row index is taken from the macro parameter j, so the macro does not
+ * depend on the caller having a variable named `i`.
+ */
+#define SCALE32x32_2x32(dst, dst_stride, j)                                    \
+    e0   = _mm_load_si128((__m128i *) &e32[j*16+0]);                          \
+    e1   = _mm_load_si128((__m128i *) &e32[j*16+4]);                          \
+    e4   = _mm_load_si128((__m128i *) &o32[j*16+0]);                          \
+    e5   = _mm_load_si128((__m128i *) &o32[j*16+4]);                          \
+    tmp0 = _mm_add_epi32(e0, e4);                                              \
+    tmp1 = _mm_add_epi32(e1, e5);                                              \
+    tmp2 = _mm_sub_epi32(e1, e5);                                              \
+    tmp3 = _mm_sub_epi32(e0, e4);                                              \
+    SCALE_4x32(tmp0, tmp1, tmp0, tmp1, tmp3, tmp2);                            \
+    _mm_store_si128((__m128i *) &dst[dst_stride*j+0]  , tmp0);                \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)+0]  , tmp1)
+
+/*
+ * SAVE16x16_2x32(dst, dst_stride, src0, src1, j): same sum/difference step
+ * as SCALE16x16_2x32, but the 32-bit results are stored unscaled. The two
+ * sum vectors go to dst[dst_stride*j] and the following four elements; the
+ * two difference vectors go to the mirrored row (dst_stride-1-j).
+ * Reads o16 from the enclosing scope; clobbers e0, e7, tmp4, src0, src1.
+ * NOTE(review): the +4 element offset implies dst has 32-bit elements --
+ * confirm at the call sites.
+ */
+#define SAVE16x16_2x32(dst, dst_stride, src0, src1, j)                        \
+    e0   = _mm_load_si128((__m128i *) &o16[j*8+0]);                           \
+    e7   = _mm_load_si128((__m128i *) &o16[j*8+4]);                           \
+    tmp4 = _mm_add_epi32(src0, e0);                                            \
+    src0 = _mm_sub_epi32(src0, e0);                                            \
+    e0   = _mm_add_epi32(src1, e7);                                            \
+    src1 = _mm_sub_epi32(src1, e7);                                            \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(             j)]  , tmp4);   \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(             j)+4], e0);     \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)]  , src0);   \
+    _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)+4], src1)
+
+
+/*
+ * Adapters giving the three output steps the common six-argument shape that
+ * TR_8 uses for its `assign` parameter.
+ *   SCALE8x8_2x32_WRAPPER   - ignores dst, dst_stride and idx; the scaled
+ *                             row is left in register dst0.
+ *   SCALE16x16_2x32_WRAPPER - ignores dst0; forwards idx as the row index.
+ *   SAVE16x16_2x32_WRAPPER  - ignores dst0; forwards idx as the row index.
+ */
+#define SCALE8x8_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx)          \
+    SCALE8x8_2x32(dst0, src0, src1)
+#define SCALE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx)        \
+    SCALE16x16_2x32(dst, dst_stride, src0, src1, idx)
+#define SAVE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx)         \
+    SAVE16x16_2x32(dst, dst_stride, src0, src1, idx)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_4x4_luma_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * COMPUTE_LUMA(dst, idx): one output vector of the 4x4 luma transform.
+ * Multiply-accumulates src0 with transform4x4_luma[idx] and src1 with
+ * transform4x4_luma[idx+1], adds the two products and the rounding vector
+ * `add`, then shifts right arithmetically by `shift`.
+ * Reads src0, src1, add and shift from the enclosing scope; clobbers tmp0
+ * and tmp1.
+ *
+ * COMPUTE_LUMA_ALL(): one full pass. Sets add to 1 << (shift - 1),
+ * interleaves tmp0/tmp1 into src0/src1, evaluates COMPUTE_LUMA for table
+ * indices 0, 2, 4 and 6, and packs the four 32-bit results into res0 and
+ * res1 with signed saturation. res2 and res3 are used as scratch.
+ */
+#define COMPUTE_LUMA(dst , idx)                                                \
+    tmp0 = _mm_load_si128((__m128i *) (transform4x4_luma[idx  ]));            \
+    tmp1 = _mm_load_si128((__m128i *) (transform4x4_luma[idx+1]));            \
+    tmp0 = _mm_madd_epi16(src0, tmp0);                                         \
+    tmp1 = _mm_madd_epi16(src1, tmp1);                                         \
+    dst  = _mm_add_epi32(tmp0, tmp1);                                          \
+    dst  = _mm_add_epi32(dst, add);                                            \
+    dst  = _mm_srai_epi32(dst, shift)
+#define COMPUTE_LUMA_ALL()                                                     \
+    add  = _mm_set1_epi32(1 << (shift - 1));                                   \
+    src0 = _mm_unpacklo_epi16(tmp0, tmp1);                                     \
+    src1 = _mm_unpackhi_epi16(tmp0, tmp1);                                     \
+    COMPUTE_LUMA(res2 , 0);                                                    \
+    COMPUTE_LUMA(res3 , 2);                                                    \
+    res0 = _mm_packs_epi32(res2, res3);                                        \
+    COMPUTE_LUMA(res2 , 4);                                                    \
+    COMPUTE_LUMA(res3 , 6);                                                    \
+    res1 = _mm_packs_epi32(res2, res3)
+
+/*
+ * TRANSFORM_LUMA(D): defines ff_hevc_transform_4x4_luma_<D>_sse2(), which
+ * transforms the 4x4 block at _coeffs in place.
+ *   - first pass uses shift 7;
+ *   - the intermediate result is re-interleaved into tmp0/tmp1;
+ *   - second pass uses shift 20 - D;
+ *   - the result is transposed and written back with two aligned stores, so
+ *     _coeffs must be 16-byte aligned.
+ * NOTE(review): LOAD4x4 is defined outside this section; it presumably
+ * fills tmp0/tmp1 from src -- confirm.
+ */
+#define TRANSFORM_LUMA(D)                                                  \
+void ff_hevc_transform_4x4_luma ## _ ## D ## _sse2(int16_t *_coeffs) {          \
+    uint8_t  shift = 7;                                                        \
+    int16_t *src    = _coeffs;                                                 \
+    int16_t *coeffs = _coeffs;                                                 \
+    __m128i res0, res1, res2, res3;                                            \
+    __m128i tmp0, tmp1, src0, src1, add;                                       \
+    LOAD4x4(tmp, src);                                                         \
+    COMPUTE_LUMA_ALL();                                                        \
+    shift = 20 - D;                                                            \
+    res2  = _mm_unpacklo_epi16(res0, res1);                                    \
+    res3  = _mm_unpackhi_epi16(res0, res1);                                    \
+    tmp0  = _mm_unpacklo_epi16(res2, res3);                                    \
+    tmp1  = _mm_unpackhi_epi16(res2, res3);                                    \
+    COMPUTE_LUMA_ALL();                                                        \
+    TRANSPOSE4X4_16(res);                                                      \
+    _mm_store_si128((__m128i *) coeffs    , res0);                             \
+    _mm_store_si128((__m128i *) (coeffs + 8), res1);                           \
+}
+
+/*
+ * Instantiate the 4x4 luma transform for 8-, 10- and 12-bit content.
+ * TRANSFORM_LUMA expands to a complete function definition, so no trailing
+ * ';' is written (it would be an empty declaration at file scope).
+ */
+TRANSFORM_LUMA( 8)
+TRANSFORM_LUMA(10)
+TRANSFORM_LUMA(12)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_4x4_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * COMPUTE4x4(dst0, dst1, dst2, dst3): 4-point butterfly.
+ * Inputs are the interleaved vectors e6 and e7 from the enclosing scope:
+ * e6 is multiply-accumulated with transform4x4[0] and [1], e7 with
+ * transform4x4[2] and [3]. The rounding constant `add` (an int in the
+ * enclosing scope) is added to the e6 products, and the outputs are
+ *   dst0 = tmp0 + tmp2      dst3 = tmp0 - tmp2
+ *   dst1 = tmp1 + tmp3      dst2 = tmp1 - tmp3
+ * e6 is overwritten with the broadcast rounding constant once its products
+ * have been taken; tmp0..tmp3 are clobbered.
+ *
+ * COMPUTE4x4_LO() writes e0..e3. COMPUTE4x4_HI() writes e7, e6, e5, e4 in
+ * that order; its `dst` parameter is unused.
+ */
+#define COMPUTE4x4(dst0, dst1, dst2, dst3)                                     \
+    tmp0 = _mm_load_si128((__m128i *) transform4x4[0]);                        \
+    tmp1 = _mm_load_si128((__m128i *) transform4x4[1]);                        \
+    tmp2 = _mm_load_si128((__m128i *) transform4x4[2]);                        \
+    tmp3 = _mm_load_si128((__m128i *) transform4x4[3]);                        \
+    tmp0 = _mm_madd_epi16(e6, tmp0);                                           \
+    tmp1 = _mm_madd_epi16(e6, tmp1);                                           \
+    tmp2 = _mm_madd_epi16(e7, tmp2);                                           \
+    tmp3 = _mm_madd_epi16(e7, tmp3);                                           \
+    e6   = _mm_set1_epi32(add);                                                \
+    tmp0 = _mm_add_epi32(tmp0, e6);                                            \
+    tmp1 = _mm_add_epi32(tmp1, e6);                                            \
+    dst0 = _mm_add_epi32(tmp0, tmp2);                                          \
+    dst1 = _mm_add_epi32(tmp1, tmp3);                                          \
+    dst2 = _mm_sub_epi32(tmp1, tmp3);                                          \
+    dst3 = _mm_sub_epi32(tmp0, tmp2)
+#define COMPUTE4x4_LO()                                                        \
+    COMPUTE4x4(e0, e1, e2, e3)
+#define COMPUTE4x4_HI(dst)                                                     \
+    COMPUTE4x4(e7, e6, e5, e4)
+
+/*
+ * TR_4: one pass of the 4x4 transform.
+ *   1. load(e, in) -- caller-supplied loader (LOAD4x4 or LOAD_EMPTY).
+ *   2. interleave e0/e1 into e6/e7 and run the 4-point butterfly into
+ *      e0..e3 (32-bit).
+ *   3. shift and pack to 16-bit: e0 and e1 each hold two rows.
+ *   4. transpose e0/e1 and hand the rows to `assign`.
+ * The `sstep` parameter is not used by the body.
+ *
+ * TR_4_1 loads with LOAD4x4; TR_4_2 uses LOAD_EMPTY and so works on whatever
+ * is already in e0/e1. Both pass ASSIGN_EMPTY, leaving the result in e0/e1.
+ * TR_4_2's `D` parameter is unused.
+ * NOTE(review): LOAD4x4, LOAD_EMPTY and ASSIGN_EMPTY are defined outside
+ * this section; the description of them above is inferred from their names
+ * and from how TRANSFORM_4x4 uses the result -- confirm.
+ */
+#define TR_4(dst, dst_stride, in, sstep, load, assign)                         \
+    load(e, in);                                                               \
+    e6 = _mm_unpacklo_epi16(e0, e1);                                           \
+    e7 = _mm_unpackhi_epi16(e0, e1);                                           \
+    COMPUTE4x4_LO();                                                           \
+    SCALE_4x32(e0, e1, e0, e1, e2, e3);                                        \
+    TRANSPOSE4X4_16_S(dst, dst_stride, e, assign)                              \
+
+#define TR_4_1( dst, dst_stride, src)    TR_4( dst, dst_stride, src,  4, LOAD4x4, ASSIGN_EMPTY)
+#define TR_4_2( dst, dst_stride, src, D) TR_4( dst, dst_stride, src,  4, LOAD_EMPTY, ASSIGN_EMPTY)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_8x8_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * TR_4_set8x4(in, sstep): 4-point stage of the 8-point transform for eight
+ * columns at once. After LOAD8x8_E fills src0..src3, the low halves are
+ * interleaved and transformed into e0..e3, then the high halves into
+ * e7, e6, e5, e4 (see COMPUTE4x4_HI).
+ * NOTE(review): LOAD8x8_E is defined outside this section.
+ *
+ * TR_COMPUTE8x8(e0, e1, i): combines two 4-point-stage registers with the
+ * contribution computed from src0..src3 and transform8x8[i], [i+1]:
+ *   tmp0 = e0 + c_lo    tmp1 = e1 + c_hi    (sum pair)
+ *   tmp2 = e0 - c_lo    tmp3 = e1 - c_hi    (difference pair)
+ * The parameter names e0/e1 shadow the register names inside this macro.
+ */
+#define TR_4_set8x4(in, sstep)                                                 \
+    LOAD8x8_E(src, in, sstep);                                                 \
+    e6 = _mm_unpacklo_epi16(src0, src2);                                       \
+    e7 = _mm_unpacklo_epi16(src1, src3);                                       \
+    COMPUTE4x4_LO();                                                           \
+    e6 = _mm_unpackhi_epi16(src0, src2);                                       \
+    e7 = _mm_unpackhi_epi16(src1, src3);                                       \
+    COMPUTE4x4_HI()
+
+#define TR_COMPUTE8x8(e0, e1, i)                                               \
+    TR_COMPUTE_TRANFORM(tmp2, tmp3, src0, src1, src2, src3, i, 0, transform8x8);\
+    tmp0 = _mm_add_epi32(e0, tmp2);                                            \
+    tmp1 = _mm_add_epi32(e1, tmp3);                                            \
+    tmp3 = _mm_sub_epi32(e1, tmp3);                                            \
+    tmp2 = _mm_sub_epi32(e0, tmp2)
+
+/*
+ * TR_8: one pass of the 8-point transform over eight columns.
+ * Runs the 4-point stage on `in` with pitch 2*sstep, loads the remaining
+ * inputs with LOAD8x8_O, then for k = 0..3 combines register pair
+ * (e<k>, e<7-k>) with transform8x8 rows 2k and 2k+1. Each combination
+ * yields a sum pair, passed to `assign` with index k, and a difference
+ * pair, passed with index 7-k. `assign` takes
+ * (dst, dst_stride, out_reg, in0, in1, idx).
+ * NOTE(review): LOAD8x8_O is defined outside this section.
+ */
+#define TR_8(dst, dst_stride, in, sstep, assign)                               \
+    TR_4_set8x4(in, 2 * sstep);                                                \
+    LOAD8x8_O(src, in, sstep);                                                 \
+    TR_COMPUTE8x8(e0, e7, 0);                                                  \
+    assign(dst, dst_stride, e0, tmp0, tmp1, 0);                                \
+    assign(dst, dst_stride, e7, tmp2, tmp3, 7);                                \
+    TR_COMPUTE8x8(e1, e6, 2);                                                  \
+    assign(dst, dst_stride, e1, tmp0, tmp1, 1);                                \
+    assign(dst, dst_stride, e6, tmp2, tmp3, 6);                                \
+    TR_COMPUTE8x8(e2, e5, 4);                                                  \
+    assign(dst, dst_stride, e2, tmp0, tmp1, 2);                                \
+    assign(dst, dst_stride, e5, tmp2, tmp3, 5);                                \
+    TR_COMPUTE8x8(e3, e4, 6);                                                  \
+    assign(dst, dst_stride, e3, tmp0, tmp1, 3);                                \
+    assign(dst, dst_stride, e4, tmp2, tmp3, 4);                                \
+
+/*
+ * TR_8_1: full 8x8 pass. Runs TR_8 with pitch 8, keeping the scaled rows in
+ * e0..e7, then transposes them and writes them to dst through SAVE_8x16.
+ */
+#define TR_8_1( dst, dst_stride, src)                                         \
+    TR_8( dst, dst_stride, src,  8, SCALE8x8_2x32_WRAPPER);                    \
+    TRANSPOSE8x8_16_S(dst, dst_stride, e, SAVE_8x16)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_XxX_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+ * TRANSFORM_4x4(D): defines ff_hevc_transform_4x4_<D>_sse2(), an in-place
+ * two-pass 4x4 transform of _coeffs. The first pass uses shift 7, the
+ * second 20 - D; `add` is the matching rounding constant 1 << (shift - 1).
+ * The result is written back with two aligned stores, so _coeffs must be
+ * 16-byte aligned. col_limit is not used.
+ * NOTE(review): p_dst1 and tmp are not declared in this function; this
+ * relies on ASSIGN_EMPTY / LOAD_EMPTY discarding those arguments -- confirm.
+ */
+#define TRANSFORM_4x4(D)                                                       \
+void ff_hevc_transform_4x4_ ## D ## _sse2 (int16_t *_coeffs, int col_limit) {  \
+    int16_t *src    = _coeffs;                                                 \
+    int16_t *coeffs = _coeffs;                                                 \
+    int      shift  = 7;                                                       \
+    int      add    = 1 << (shift - 1);                                        \
+    __m128i tmp0, tmp1, tmp2, tmp3;                                            \
+    __m128i e0, e1, e2, e3, e6, e7;                                            \
+    TR_4_1(p_dst1, 4, src);                                                    \
+    shift   = 20 - D;                                                          \
+    add     = 1 << (shift - 1);                                                \
+    TR_4_2(coeffs, 8, tmp, D);                                                 \
+    _mm_store_si128((__m128i *) coeffs    , e0);                               \
+    _mm_store_si128((__m128i *) (coeffs + 8), e1);                             \
+}
+/*
+ * TRANSFORM_8x8(D): defines ff_hevc_transform_8x8_<D>_sse2(), a two-pass
+ * 8x8 transform. The first pass (shift 7) reads coeffs and writes the
+ * transposed intermediate to the aligned scratch buffer tmp. The second
+ * pass (shift 20 - D) reads tmp and writes the transposed result back to
+ * coeffs. `add` is the rounding constant 1 << (shift - 1) for each pass.
+ * col_limit is not used.
+ */
+#define TRANSFORM_8x8(D)                                                       \
+void ff_hevc_transform_8x8_ ## D ## _sse2 (int16_t *coeffs, int col_limit) {    \
+    DECLARE_ALIGNED(16, int16_t, tmp[8*8]);                                    \
+    int16_t *src    = coeffs;                                                  \
+    int16_t *p_dst1 = tmp;                                                     \
+    int16_t *p_dst;                                                            \
+    int      shift  = 7;                                                       \
+    int      add    = 1 << (shift - 1);                                        \
+    __m128i src0, src1, src2, src3;                                            \
+    __m128i tmp0, tmp1, tmp2, tmp3;                                            \
+    __m128i e0, e1, e2, e3, e4, e5, e6, e7;                                    \
+    TR_8_1(p_dst1, 8, src);                                                    \
+    shift   = 20 - D;                                                          \
+    add     = 1 << (shift - 1);                                                \
+    TR_8_1(coeffs, 8, tmp);                                                    \
+}
+
+/* Only the 12-bit variants of the 4x4 and 8x8 transforms are built here. */
+TRANSFORM_4x4(12)
+TRANSFORM_8x8(12)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_16x16_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * Multiply-accumulate helpers for the 16-point transform, all using the
+ * transform16x16_1 table.
+ *   TR_COMPUTE16x16       - TR_COMPUTE_TRANFORM bound to that table.
+ *   TR_COMPUTE16x16_FIRST - assigns src0/src1 from e0..e3 and table rows
+ *                           0 and 1, column j.
+ *   TR_COMPUTE16x16_NEXT  - adds the contribution of e4..e7 and table rows
+ *                           i and i+1, column j, to src0/src1; clobbers
+ *                           tmp0/tmp1.
+ */
+#define TR_COMPUTE16x16(dst1, dst2,src0, src1, src2, src3, i, j)              \
+    TR_COMPUTE_TRANFORM(dst1, dst2,src0, src1, src2, src3, i, j, transform16x16_1)
+#define TR_COMPUTE16x16_FIRST(j)                                               \
+    TR_COMPUTE16x16(src0, src1, e0, e1, e2, e3, 0, j)
+#define TR_COMPUTE16x16_NEXT(i, j)                                             \
+    TR_COMPUTE16x16(tmp0, tmp1, e4, e5, e6, e7, i, j);                         \
+    src0 = _mm_add_epi32(src0, tmp0);                                          \
+    src1 = _mm_add_epi32(src1, tmp1)
+
+/*
+ * TR_16: one pass of the 16-point transform over eight columns.
+ *   1. LOAD16x16_O fills e0..e7 from `in` (defined outside this section).
+ *   2. For each of the 8 output pairs, accumulate the contribution of
+ *      e0..e7 and save the two 32-bit vectors into row i of o16.
+ *   3. Run the 8-point stage on `in` with pitch 2*sstep; its `assign` step
+ *      (SCALE16x16_2x32 or SAVE16x16_2x32) combines each row with o16.
+ *
+ * o16 is read with _mm_load_si128, which requires 16-byte alignment, so the
+ * buffer is declared with DECLARE_ALIGNED. A plain `int` array is only
+ * guaranteed 4-byte alignment, which can fault on 32-bit x86 builds.
+ * The local `i` intentionally shadows any `i` in the caller.
+ */
+#define TR_16(dst, dst_stride, in, sstep, assign)                              \
+    {                                                                          \
+        int i;                                                                 \
+        DECLARE_ALIGNED(16, int, o16[8*8]);                                    \
+        LOAD16x16_O(e, in, sstep);                                             \
+        for (i = 0; i < 8; i++) {                                              \
+            TR_COMPUTE16x16_FIRST(i);                                          \
+            TR_COMPUTE16x16_NEXT(2, i);                                        \
+            SAVE_8x32(o16, 8, src0, src1, i);                                  \
+        }                                                                      \
+        TR_8(dst, dst_stride, in, 2 * sstep, assign);                          \
+    }
+
+/*
+ * TR_16_1: stand-alone 16-point pass with input pitch 16; results are scaled
+ *          and packed to 16-bit (SCALE16x16_2x32_WRAPPER).
+ * TR_16_2: 16-point pass with a caller-chosen pitch; results are stored as
+ *          unscaled 32-bit values (SAVE16x16_2x32_WRAPPER). Used by TR_32.
+ */
+#define TR_16_1( dst, dst_stride, src)        TR_16( dst, dst_stride, src,     16, SCALE16x16_2x32_WRAPPER)
+#define TR_16_2( dst, dst_stride, src, sstep) TR_16( dst, dst_stride, src,  sstep, SAVE16x16_2x32_WRAPPER )
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_32x32_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * Multiply-accumulate helpers for the 32-point transform, all using the
+ * transform32x32 table.
+ *   TR_COMPUTE32x32       - TR_COMPUTE_TRANFORM bound to that table.
+ *   TR_COMPUTE32x32_FIRST - adds the contribution of e0..e3 (table rows i,
+ *                           i+1, column j) to src0/src1.
+ *   TR_COMPUTE32x32_NEXT  - adds the contribution of e4..e7 likewise.
+ * Unlike TR_COMPUTE16x16_FIRST, both variants accumulate, so src0/src1 must
+ * be initialised by the caller. tmp0/tmp1 are clobbered.
+ */
+#define TR_COMPUTE32x32(dst1, dst2,src0, src1, src2, src3, i, j)              \
+    TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform32x32)
+#define TR_COMPUTE32x32_FIRST(i, j)                                            \
+    TR_COMPUTE32x32(tmp0, tmp1, e0, e1, e2, e3, i, j);                         \
+    src0 = _mm_add_epi32(src0, tmp0);                                          \
+    src1 = _mm_add_epi32(src1, tmp1)
+#define TR_COMPUTE32x32_NEXT(i, j)                                             \
+    TR_COMPUTE32x32(tmp0, tmp1, e4, e5, e6, e7, i, j);                         \
+    src0 = _mm_add_epi32(src0, tmp0);                                          \
+    src1 = _mm_add_epi32(src1, tmp1)
+
+/*
+ * TR_32: one pass of the 32-point transform over eight columns.
+ *   1. Load the first group of inputs with LOAD16x16_O and, for each of the
+ *      16 output pairs, accumulate table rows 0..3 from zero into o32.
+ *   2. Load the second group, starting at in[16*sstep], and add table rows
+ *      4..7 on top of the values already in o32.
+ *   3. Run the 16-point stage on `in` with pitch 2*sstep, storing its
+ *      unscaled 32-bit output in e32.
+ *   4. For each of the 16 row pairs, combine e32 and o32, scale, and store
+ *      to dst (SCALE32x32_2x32).
+ * e32 and o32 are 16-byte aligned scratch buffers local to the block.
+ * NOTE(review): LOAD16x16_O, LOAD_8x32 and SAVE_8x32 are defined outside
+ * this section.
+ *
+ * TR_32_1: TR_32 with input pitch 32.
+ */
+#define TR_32(dst, dst_stride, in, sstep)                                      \
+    {                                                                          \
+        int i;                                                                 \
+        DECLARE_ALIGNED(16, int, e32[16*16]);                                  \
+        DECLARE_ALIGNED(16, int, o32[16*16]);                                  \
+        LOAD16x16_O(e, in, sstep);                                             \
+        for (i = 0; i < 16; i++) {                                             \
+            src0 = _mm_setzero_si128();                                        \
+            src1 = _mm_setzero_si128();                                        \
+            TR_COMPUTE32x32_FIRST(0, i);                                       \
+            TR_COMPUTE32x32_NEXT(2, i);                                        \
+            SAVE_8x32(o32, 16, src0, src1, i);                                 \
+        }                                                                      \
+        LOAD16x16_O(e, (&in[16*sstep]), sstep);                                \
+        for (i = 0; i < 16; i++) {                                             \
+            LOAD_8x32(o32, 16, src0, src1, i);                                 \
+            TR_COMPUTE32x32_FIRST(4, i);                                       \
+            TR_COMPUTE32x32_NEXT(6, i);                                        \
+            SAVE_8x32(o32, 16, src0, src1, i);                                 \
+        }                                                                      \
+        TR_16_2(e32, 16, in, 2 * sstep);                                       \
+        for (i = 0; i < 16; i++) {                                             \
+            SCALE32x32_2x32(dst, dst_stride, i);                               \
+        }                                                                      \
+    }
+
+#define TR_32_1( dst, dst_stride, src)        TR_32( dst, dst_stride, src, 32)
+
+////////////////////////////////////////////////////////////////////////////////
+// ff_hevc_transform_XxX_X_sse2
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * TRANSFORM2(H, D): defines ff_hevc_transform_<H>x<H>_<D>_sse2(), a
+ * two-pass HxH transform built from TR_<H>_1.
+ *
+ * Each pass k walks the block in strips of eight columns:
+ *   - TR_<H>_1 transforms the strip at src into tmp + i (row pitch H);
+ *   - src advances by 8;
+ *   - every 8x8 tile of that strip is transposed from tmp into p_tra.
+ * Pass 0 uses shift 7 and transposes into tmp_2; pass 1 reads tmp_2, uses
+ * shift 20 - D and transposes into coeffs. `add` is the rounding constant
+ * 1 << (shift - 1), recomputed at the top of each pass.
+ * col_limit is not used.
+ */
+#define TRANSFORM2(H, D)                                                   \
+void ff_hevc_transform_ ## H ## x ## H ## _ ## D ## _sse2 (                \
+    int16_t *coeffs, int col_limit) {                                          \
+    int i, j, k, add;                                                          \
+    int      shift = 7;                                                        \
+    int16_t *src   = coeffs;                                                   \
+    DECLARE_ALIGNED(16, int16_t, tmp[H*H]);                                    \
+    DECLARE_ALIGNED(16, int16_t, tmp_2[H*H]);                                  \
+    int16_t *p_dst, *p_tra = tmp_2;                                            \
+    __m128i src0, src1, src2, src3;                                            \
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4;                                      \
+    __m128i e0, e1, e2, e3, e4, e5, e6, e7;                                    \
+    for (k = 0; k < 2; k++) {                                                  \
+        add   = 1 << (shift - 1);                                              \
+        for (i = 0; i < H; i+=8) {                                             \
+            p_dst = tmp + i;                                                   \
+            TR_ ## H ## _1(p_dst, H, src);                                     \
+            src   += 8;                                                        \
+            for (j = 0; j < H; j+=8) {                                         \
+               TRANSPOSE8x8_16_LS((&p_tra[i*H+j]), H, (&tmp[j*H+i]), H, SAVE_8x16);\
+            }                                                                  \
+        }                                                                      \
+        src   = tmp_2;                                                         \
+        p_tra = coeffs;                                                         \
+        shift = 20 - D;                                                        \
+    }                                                                          \
+}
+
+/*
+ * Instantiate the 16x16 and 32x32 transforms. The 8- and 10-bit variants are
+ * built only when ARCH_X86_64 is false; the 12-bit variants are always
+ * built. TRANSFORM2 expands to a complete function definition, so no
+ * trailing ';' is written (it would be an empty declaration at file scope).
+ */
+#if !ARCH_X86_64
+    TRANSFORM2(16,  8)
+    TRANSFORM2(16, 10)
+#endif
+TRANSFORM2(16, 12)
+
+#if !ARCH_X86_64
+    TRANSFORM2(32,  8)
+    TRANSFORM2(32, 10)
+#endif
+TRANSFORM2(32, 12)
+
+#endif
+
+#ifdef __GNUC__
+#pragma GCC pop_options
+#endif
--- a/libavcodec/x86/hevc_intra_intrinsic.c
+++ b/libavcodec/x86/hevc_intra_intrinsic.c
@ -0,0 +1,922 @@
+#include "config.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavcodec/get_bits.h"
+#include "libavcodec/hevc.h"
+#include "libavcodec/x86/hevcpred.h"
+
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#endif
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+#if HAVE_SSSE3
+#include <tmmintrin.h>
+#endif
+#if HAVE_SSE4
+#include <smmintrin.h>
+#endif
+
+#if HAVE_SSE4
+#define _MM_PACKUS_EPI32 _mm_packus_epi32
+#else
+/* Fallback when HAVE_SSE4 is 0: keep the low 16 bits of every 32-bit lane of
+ * a and b (sign-extended first, so the signed pack below cannot saturate).
+ * NOTE(review): wraps instead of saturating like _mm_packus_epi32 -- confirm. */
+static av_always_inline __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
+{
+    const __m128i lo = _mm_srai_epi32(_mm_slli_epi32(a, 16), 16);
+    const __m128i hi = _mm_srai_epi32(_mm_slli_epi32(b, 16), 16);
+    return _mm_packs_epi32(lo, hi);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Planar intra prediction; everything down to the matching #endif needs HAVE_SSE4
+////////////////////////////////////////////////////////////////////////////////
+#if HAVE_SSE4
+#define PLANAR_INIT_8() /* typed uint8_t views of _src, _top, _left */         \
+    uint8_t *src = (uint8_t*)_src;                                             \
+    const uint8_t *top = (const uint8_t*)_top;                                 \
+    const uint8_t *left = (const uint8_t*)_left
+#define PLANAR_INIT_10() /* typed uint16_t views of _src, _top, _left */       \
+    uint16_t *src = (uint16_t*)_src;                                           \
+    const uint16_t *top = (const uint16_t*)_top;                               \
+    const uint16_t *left = (const uint16_t*)_left
+/* Row y, 8 lanes: c0 = (tmp1*ly.lane0 + c1 + (val-y)*tx + (1+y)*l0) >> shift */
+#define PLANAR_COMPUTE(val, shift)                                             \
+    add = _mm_mullo_epi16(_mm_set1_epi16(1+y), l0);                            \
+    ly1 = _mm_unpacklo_epi16(ly , ly );                                        \
+    ly1 = _mm_unpacklo_epi32(ly1, ly1);                                        \
+    ly1 = _mm_unpacklo_epi64(ly1, ly1);                                        \
+    c0  = _mm_mullo_epi16(tmp1, ly1);                                          \
+    x0  = _mm_mullo_epi16(_mm_set1_epi16(val - y), tx);                        \
+    c0  = _mm_add_epi16(c0, c1);                                               \
+    x0  = _mm_add_epi16(x0, c0);                                               \
+    x0  = _mm_add_epi16(x0, add);                                              \
+    c0  = _mm_srli_epi16(x0, shift)
+/* Second 8 lanes into C0 (tmp2, th, C1); reuses ly1/add from PLANAR_COMPUTE */
+#define PLANAR_COMPUTE_HI(val, shift)                                          \
+    C0  = _mm_mullo_epi16(tmp2, ly1);                                          \
+    x0  = _mm_mullo_epi16(_mm_set1_epi16(val - y), th);                        \
+    C0  = _mm_add_epi16(C0, C1);                                               \
+    x0  = _mm_add_epi16(x0, C0);                                               \
+    x0  = _mm_add_epi16(x0, add);                                              \
+    C0  = _mm_srli_epi16(x0, shift)
+
+////////////////////////////////////////////////////////////////////////////////
+// 4x4 planar: pred_planar_0_8_sse() and pred_planar_0_10_sse()
+////////////////////////////////////////////////////////////////////////////////
+#define PLANAR_LOAD_0_8() /* ly: left[0..3] each twice; tx: top[0..3] twice */ \
+    ly   = _mm_loadl_epi64((__m128i*) left);                                   \
+    tx   = _mm_loadl_epi64((__m128i*) top);                                    \
+    ly   = _mm_unpacklo_epi8(ly, _mm_setzero_si128());                         \
+    tx   = _mm_unpacklo_epi8(tx, _mm_setzero_si128());                         \
+    ly   = _mm_unpacklo_epi16(ly, ly);                                         \
+    tx   = _mm_unpacklo_epi64(tx, tx)
+#define PLANAR_LOAD_0_10() /* same lane layout from 16-bit input */            \
+    ly   = _mm_loadl_epi64((__m128i*) left);                                   \
+    tx   = _mm_loadl_epi64((__m128i*) top);                                    \
+    ly   = _mm_unpacklo_epi16(ly, ly);                                         \
+    tx   = _mm_unpacklo_epi64(tx, tx)
+/* Two rows at once: low 4 lanes weight tx by v2 and l0 by v4, high 4 by v1, v3 */
+#define PLANAR_COMPUTE_0(dst , v1, v2, v3, v4)                                 \
+    dst = _mm_mullo_epi16(tmp1, ly1);                                          \
+    x0  = _mm_mullo_epi16(_mm_set_epi16(v1,v1,v1,v1,v2,v2,v2,v2), tx);         \
+    add = _mm_mullo_epi16(_mm_set_epi16(v3,v3,v3,v3,v4,v4,v4,v4), l0);         \
+    dst = _mm_add_epi16(dst, c1);                                              \
+    x0  = _mm_add_epi16(x0, add);                                              \
+    dst = _mm_add_epi16(dst, x0);                                              \
+    dst = _mm_srli_epi16(dst, 3)
+/* Pack c0/C0 to bytes (unsigned saturation) and write 4 bytes to each of 4 rows */
+#define PLANAR_STORE_0_8()                                                     \
+    c0  = _mm_packus_epi16(c0,C0);                                             \
+    *((uint32_t *) src              ) = _mm_cvtsi128_si32(c0   );              \
+    *((uint32_t *)(src +     stride)) = _mm_extract_epi32(c0, 1);              \
+    *((uint32_t *)(src + 2 * stride)) = _mm_extract_epi32(c0, 2);              \
+    *((uint32_t *)(src + 3 * stride)) = _mm_extract_epi32(c0, 3)
+#define PLANAR_STORE_0_10() /* 4 rows of four 16-bit values */                 \
+    _mm_storel_epi64((__m128i*)(src             ), c0);                        \
+    _mm_storel_epi64((__m128i*)(src +     stride), _mm_unpackhi_epi64(c0, c0));\
+    _mm_storel_epi64((__m128i*)(src + 2 * stride), C0);                        \
+    _mm_storel_epi64((__m128i*)(src + 3 * stride), _mm_unpackhi_epi64(C0, C0))
+/* Defines pred_planar_0_<D>_sse(): a 4x4 block from top[0..4] and left[0..4]. */
+#define PRED_PLANAR_0(D)                                                       \
+void pred_planar_0_ ## D ## _sse(uint8_t *_src, const uint8_t *_top,           \
+        const uint8_t *_left, ptrdiff_t stride) {                              \
+    __m128i ly, l0, tx, ly1;                                                   \
+    __m128i tmp1, add, x0, c0, c1, C0;                                         \
+    PLANAR_INIT_ ## D();                                                       \
+    tx   = _mm_set1_epi16(top[4]);                                             \
+    l0   = _mm_set1_epi16(left[4]);                                            \
+    add  = _mm_set1_epi16(4);                                                  \
+    tmp1 = _mm_set_epi16(0,1,2,3,0,1,2,3); /* 3-x per column, two rows */      \
+    c1   = _mm_mullo_epi16(_mm_set_epi16(4,3,2,1,4,3,2,1), tx);                \
+    c1   = _mm_add_epi16(c1, add); /* (x+1)*top[4] + 4 */                      \
+    PLANAR_LOAD_0_ ##D();                                                      \
+                                                                               \
+    ly1 = _mm_unpacklo_epi32(ly, ly);                                          \
+    PLANAR_COMPUTE_0(c0, 2, 3, 2, 1);                                          \
+    ly1 = _mm_unpackhi_epi32(ly, ly);                                          \
+    PLANAR_COMPUTE_0(C0, 0, 1, 4, 3);                                          \
+    PLANAR_STORE_0_ ## D();                                                    \
+}
+PRED_PLANAR_0( 8)
+PRED_PLANAR_0(10)
+
+////////////////////////////////////////////////////////////////////////////////
+// 8x8 planar: pred_planar_1_8_sse() and pred_planar_1_10_sse()
+////////////////////////////////////////////////////////////////////////////////
+#define PLANAR_LOAD_1_8() /* left/top[0..7] widened to 16 bits */              \
+    ly   = _mm_loadl_epi64((__m128i*)left);                                    \
+    tx   = _mm_loadl_epi64((__m128i*)top);                                     \
+    ly   = _mm_unpacklo_epi8(ly,_mm_setzero_si128());                          \
+    tx   = _mm_unpacklo_epi8(tx,_mm_setzero_si128())
+#define PLANAR_LOAD_1_10() /* left/top[0..7], already 16-bit */                \
+    ly   = _mm_loadu_si128((__m128i*)left);                                    \
+    tx   = _mm_loadu_si128((__m128i*)top)
+/* 8x8 row: weights run 7-y / 1+y, sum is shifted right by 4 */
+#define PLANAR_COMPUTE_1()                                                     \
+    PLANAR_COMPUTE(7, 4)
+/* Store one row, step src by stride, shift ly so the next left sample is lane 0 */
+#define PLANAR_STORE_1_8()                                                     \
+    c0  = _mm_packus_epi16(c0,_mm_setzero_si128());                            \
+    _mm_storel_epi64((__m128i*)(src), c0);                                     \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly,2)
+#define PLANAR_STORE_1_10()                                                    \
+    _mm_storeu_si128((__m128i*)(src), c0);                                     \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly,2)
+/* Defines pred_planar_1_<D>_sse(): 8 rows of 8 samples from top[0..8], left[0..8] */
+#define PRED_PLANAR_1(D)                                                       \
+void pred_planar_1_ ## D ## _sse(uint8_t *_src, const uint8_t *_top,           \
+        const uint8_t *_left, ptrdiff_t stride) {                              \
+    int y;                                                                     \
+    __m128i ly, l0, tx, ly1;                                                   \
+    __m128i tmp1, add, x0, c0, c1;                                             \
+    PLANAR_INIT_ ## D();                                                       \
+    tx   = _mm_set1_epi16(top[8]);                                             \
+    l0   = _mm_set1_epi16(left[8]);                                            \
+    add  = _mm_set1_epi16(8);                                                  \
+    tmp1 = _mm_set_epi16(0,1,2,3,4,5,6,7); /* 7-x per column */                \
+    c1   = _mm_mullo_epi16(_mm_set_epi16(8,7,6,5,4,3,2,1), tx);                \
+    c1   = _mm_add_epi16(c1,add); /* (x+1)*top[8] + 8 */                       \
+    PLANAR_LOAD_1_ ## D();                                                     \
+    for (y = 0; y < 8; y++) {                                                  \
+        PLANAR_COMPUTE_1();                                                    \
+        PLANAR_STORE_1_ ## D();                                                \
+    }                                                                          \
+}
+
+PRED_PLANAR_1( 8)
+PRED_PLANAR_1(10)
+
+////////////////////////////////////////////////////////////////////////////////
+// 16x16 planar: pred_planar_2_8_sse() and pred_planar_2_10_sse()
+////////////////////////////////////////////////////////////////////////////////
+#define PLANAR_LOAD_2_8() /* ly/lh = left[0..7]/[8..15]; tx/th from top */     \
+    ly   = _mm_loadu_si128((__m128i*) left);                                   \
+    tx   = _mm_loadu_si128((__m128i*) top);                                    \
+    lh   = _mm_unpackhi_epi8(ly,_mm_setzero_si128());                          \
+    ly   = _mm_unpacklo_epi8(ly,_mm_setzero_si128());                          \
+    th   = _mm_unpackhi_epi8(tx,_mm_setzero_si128());                          \
+    tx   = _mm_unpacklo_epi8(tx,_mm_setzero_si128())
+/* 16-bit input: the same four registers come from four direct loads */
+#define PLANAR_LOAD_2_10()                                                     \
+    ly   = _mm_loadu_si128((__m128i*) left);                                   \
+    lh   = _mm_loadu_si128((__m128i*)&left[8]);                                \
+    tx   = _mm_loadu_si128((__m128i*) top);                                    \
+    th   = _mm_loadu_si128((__m128i*)&top[8])
+/* 16x16 rows: weights 15-y / 1+y, shift 5; _HI covers columns 8..15 */
+#define PLANAR_COMPUTE_2()                                                     \
+    PLANAR_COMPUTE(15, 5)
+#define PLANAR_COMPUTE_HI_2()                                                  \
+    PLANAR_COMPUTE_HI(15, 5)
+/* Store one 16-sample row, step src by stride, shift ly to the next left sample */
+#define PLANAR_STORE_2_8()                                                     \
+    c0  = _mm_packus_epi16(c0, C0);                                            \
+    _mm_storeu_si128((__m128i*) src, c0);                                      \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly,2)
+#define PLANAR_STORE_2_10()                                                    \
+    _mm_storeu_si128((__m128i*) src   , c0);                                   \
+    _mm_storeu_si128((__m128i*)&src[8], C0);                                   \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly,2)
+/* Defines pred_planar_2_<D>_sse(); after 8 rows ly is refilled from lh (left[8..15]) */
+#define PRED_PLANAR_2(D)                                                       \
+void pred_planar_2_ ## D ## _sse(uint8_t *_src, const uint8_t *_top,           \
+        const uint8_t *_left, ptrdiff_t stride) {                              \
+    int y, i;                                                                  \
+    __m128i ly, lh, l0, tx, th, ly1;                                           \
+    __m128i tmp1, tmp2, add, x0, c0, c1, C0, C1;                               \
+    PLANAR_INIT_ ## D();                                                       \
+    tx   = _mm_set1_epi16(top[16]);                                            \
+    l0   = _mm_set1_epi16(left[16]);                                           \
+    add  = _mm_set1_epi16(16);                                                 \
+    tmp1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15);                             \
+    tmp2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7);                             \
+    c1   = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx);        \
+    C1   = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx);        \
+    c1   = _mm_add_epi16(c1, add);                                             \
+    C1   = _mm_add_epi16(C1, add);                                             \
+    PLANAR_LOAD_2_ ## D();                                                     \
+    for (i = 0; i < 2; i++) {                                                  \
+        for (y = i*8; y < i*8+8; y++) {                                        \
+            PLANAR_COMPUTE_2();                                                \
+            PLANAR_COMPUTE_HI_2();                                             \
+            PLANAR_STORE_2_ ## D();                                            \
+        }                                                                      \
+        ly = lh;                                                               \
+    }                                                                          \
+}
+
+PRED_PLANAR_2( 8)
+PRED_PLANAR_2(10)
+
+////////////////////////////////////////////////////////////////////////////////
+// 32x32 planar: pred_planar_3_8_sse() and pred_planar_3_10_sse()
+////////////////////////////////////////////////////////////////////////////////
+#define PLANAR_LOAD_3_8() /* as LOAD_2 plus TX/TH = top[16..23]/[24..31] */    \
+    ly   = _mm_loadu_si128((__m128i*) left);                                   \
+    lh   = _mm_unpackhi_epi8(ly,_mm_setzero_si128());                          \
+    ly   = _mm_unpacklo_epi8(ly,_mm_setzero_si128());                          \
+    tx   = _mm_loadu_si128((__m128i*) top);                                    \
+    th   = _mm_unpackhi_epi8(tx,_mm_setzero_si128());                          \
+    tx   = _mm_unpacklo_epi8(tx,_mm_setzero_si128());                          \
+    TX   = _mm_loadu_si128((__m128i*)(top + 16));                              \
+    TH   = _mm_unpackhi_epi8(TX,_mm_setzero_si128());                          \
+    TX   = _mm_unpacklo_epi8(TX,_mm_setzero_si128())
+#define PLANAR_LOAD_3_10()                                                     \
+    ly   = _mm_loadu_si128((__m128i*) left   );                                \
+    lh   = _mm_loadu_si128((__m128i*)&left[8]);                                \
+    tx   = _mm_loadu_si128((__m128i*) top    );                                \
+    th   = _mm_loadu_si128((__m128i*)&top[ 8]);                                \
+    TX   = _mm_loadu_si128((__m128i*)&top[16]);                                \
+    TH   = _mm_loadu_si128((__m128i*)&top[24])
+/* Refill ly/lh from left[16..23] / left[24..31] */
+#define PLANAR_RELOAD_3_8()                                                    \
+    ly = _mm_loadu_si128((__m128i*)(left+16));                                 \
+    lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128());                            \
+    ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128())
+#define PLANAR_RELOAD_3_10()                                                   \
+    ly = _mm_loadu_si128((__m128i*)&left[16]);                                 \
+    lh = _mm_loadu_si128((__m128i*)&left[24])
+/* 32x32 rows: weights 31-y / 1+y, shift 6; HI2/HI3 cover columns 16..23 / 24..31 */
+#define PLANAR_COMPUTE_3()                                                     \
+    PLANAR_COMPUTE(31, 6)
+#define PLANAR_COMPUTE_HI_3()                                                  \
+    PLANAR_COMPUTE_HI(31, 6)
+#define PLANAR_COMPUTE_HI2_3()                                                 \
+    c0  = _mm_mullo_epi16(TMP1, ly1);                                          \
+    x0  = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TX);                         \
+    c0  = _mm_add_epi16(c0, c2);                                               \
+    x0  = _mm_add_epi16(x0, c0);                                               \
+    x0  = _mm_add_epi16(x0, add);                                              \
+    c0  = _mm_srli_epi16(x0, 6)
+#define PLANAR_COMPUTE_HI3_3()                                                 \
+    C0  = _mm_mullo_epi16(TMP2, ly1);                                          \
+    x0  = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TH);                         \
+    C0  = _mm_add_epi16(C0, C2);                                               \
+    x0  = _mm_add_epi16(x0, C0);                                               \
+    x0  = _mm_add_epi16(x0, add);                                              \
+    C0  = _mm_srli_epi16(x0, 6)
+/* STORE1 writes columns 0..15 of the row; STORE2 writes 16..31 and advances src, ly */
+#define PLANAR_STORE1_3_8()                                                    \
+    c0 = _mm_packus_epi16(c0, C0);                                             \
+    _mm_storeu_si128((__m128i*) src, c0)
+#define PLANAR_STORE2_3_8()                                                    \
+    c0  = _mm_packus_epi16(c0, C0);                                            \
+    _mm_storeu_si128((__m128i*) (src + 16), c0);                               \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly, 2)
+
+#define PLANAR_STORE1_3_10()                                                   \
+    _mm_storeu_si128((__m128i*) src    , c0);                                  \
+    _mm_storeu_si128((__m128i*)&src[ 8], C0)
+#define PLANAR_STORE2_3_10()                                                   \
+    _mm_storeu_si128((__m128i*)&src[16], c0);                                  \
+    _mm_storeu_si128((__m128i*)&src[24], C0);                                  \
+    src+= stride;                                                              \
+    ly  = _mm_srli_si128(ly, 2)
+/* Defines pred_planar_3_<D>_sse(): 32 rows handled as four groups of 8 rows. */
+/* ly <- lh after groups 0 and 2; RELOAD after 1 and 3 (the last one is unused). */
+#define PRED_PLANAR_3(D)                                                       \
+void pred_planar_3_ ## D ## _sse(uint8_t *_src, const uint8_t *_top,           \
+        const uint8_t *_left, ptrdiff_t stride) {                              \
+    int y, i;                                                                  \
+    __m128i l0, ly, lh, ly1, tx, th, TX, TH, tmp1, tmp2, TMP1, TMP2;           \
+    __m128i x0, c0, c1, c2, C0, C1, C2, add;                                   \
+    PLANAR_INIT_ ## D();                                                       \
+    tx   = _mm_set1_epi16(top[32]);                                            \
+    l0   = _mm_set1_epi16(left[32]);                                           \
+    add  = _mm_set1_epi16(32);                                                 \
+    tmp1 = _mm_set_epi16(24,25,26,27,28,29,30,31);                             \
+    tmp2 = _mm_set_epi16(16,17,18,19,20,21,22,23);                             \
+    TMP1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15);                             \
+    TMP2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7);                             \
+    c1   = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx);        \
+    C1   = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx);        \
+    c2   = _mm_mullo_epi16(_mm_set_epi16(24,23,22,21,20,19,18,17), tx);        \
+    C2   = _mm_mullo_epi16(_mm_set_epi16(32,31,30,29,28,27,26,25), tx);        \
+    c1   = _mm_add_epi16(c1, add);                                             \
+    C1   = _mm_add_epi16(C1, add);                                             \
+    c2   = _mm_add_epi16(c2, add);                                             \
+    C2   = _mm_add_epi16(C2, add);                                             \
+    PLANAR_LOAD_3_ ## D();                                                     \
+    for (i = 0; i < 4; i++) {                                                  \
+        for (y = 0+i*8; y < 8+i*8; y++) {                                      \
+            PLANAR_COMPUTE_3();                                                \
+            PLANAR_COMPUTE_HI_3();                                             \
+            PLANAR_STORE1_3_ ## D();                                           \
+            PLANAR_COMPUTE_HI2_3();                                            \
+            PLANAR_COMPUTE_HI3_3();                                            \
+            PLANAR_STORE2_3_ ## D();                                           \
+        }                                                                      \
+        if (i == 0 || i == 2) {                                                \
+            ly = lh;                                                           \
+        } else {                                                               \
+            PLANAR_RELOAD_3_ ## D();                                           \
+        }                                                                      \
+    }                                                                          \
+}
+
+PRED_PLANAR_3( 8)
+PRED_PLANAR_3(10)
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Row stores shared by the 8x8 transpose macros
+////////////////////////////////////////////////////////////////////////////////
+#define STORE8(out, sstep_out) /* 8 rows x 8 bytes from m10..m13 */            \
+    _mm_storel_epi64((__m128i*)&out[0*sstep_out], m10);                        \
+    _mm_storel_epi64((__m128i*)&out[2*sstep_out], m12);                        \
+    _mm_storel_epi64((__m128i*)&out[4*sstep_out], m11);                        \
+    _mm_storel_epi64((__m128i*)&out[6*sstep_out], m13);                        \
+    m10 = _mm_unpackhi_epi64(m10, m10);                                        \
+    m12 = _mm_unpackhi_epi64(m12, m12);                                        \
+    m11 = _mm_unpackhi_epi64(m11, m11);                                        \
+    m13 = _mm_unpackhi_epi64(m13, m13);                                        \
+    _mm_storel_epi64((__m128i*)&out[1*sstep_out], m10);                        \
+    _mm_storel_epi64((__m128i*)&out[3*sstep_out], m12);                        \
+    _mm_storel_epi64((__m128i*)&out[5*sstep_out], m11);                        \
+    _mm_storel_epi64((__m128i*)&out[7*sstep_out], m13)
+/* Writes m0..m7 as eight unaligned 128-bit rows, sstep_out elements apart */
+#define STORE16(out, sstep_out)                                                \
+    _mm_storeu_si128((__m128i *) &out[0*sstep_out], m0);                       \
+    _mm_storeu_si128((__m128i *) &out[1*sstep_out], m1);                       \
+    _mm_storeu_si128((__m128i *) &out[2*sstep_out], m2);                       \
+    _mm_storeu_si128((__m128i *) &out[3*sstep_out], m3);                       \
+    _mm_storeu_si128((__m128i *) &out[4*sstep_out], m4);                       \
+    _mm_storeu_si128((__m128i *) &out[5*sstep_out], m5);                       \
+    _mm_storeu_si128((__m128i *) &out[6*sstep_out], m6);                       \
+    _mm_storeu_si128((__m128i *) &out[7*sstep_out], m7)
+
+#define TRANSPOSE4x4_8(in, sstep_in, out, sstep_out) /* 4x4 byte transpose */  \
+    {                                                                          \
+        __m128i m0  = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]);            \
+        __m128i m1  = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]);            \
+        __m128i m2  = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]);            \
+        __m128i m3  = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]);            \
+        /* NOTE(review): loads 8 bytes per row, only the low 4 are used */     \
+        __m128i m10 = _mm_unpacklo_epi8(m0, m1);                               \
+        __m128i m11 = _mm_unpacklo_epi8(m2, m3);                               \
+                                                                               \
+        m0  = _mm_unpacklo_epi16(m10, m11);                                    \
+                                                                               \
+        *((uint32_t *) (out+0*sstep_out)) =_mm_cvtsi128_si32(m0);              \
+        *((uint32_t *) (out+1*sstep_out)) =_mm_extract_epi32(m0, 1);           \
+        *((uint32_t *) (out+2*sstep_out)) =_mm_extract_epi32(m0, 2);           \
+        *((uint32_t *) (out+3*sstep_out)) =_mm_extract_epi32(m0, 3);           \
+    }
+#define TRANSPOSE8x8_8(in, sstep_in, out, sstep_out) /* 8x8 byte transpose */  \
+    {                                                                          \
+        __m128i m0  = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]);            \
+        __m128i m1  = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]);            \
+        __m128i m2  = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]);            \
+        __m128i m3  = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]);            \
+        __m128i m4  = _mm_loadl_epi64((__m128i *) &in[4*sstep_in]);            \
+        __m128i m5  = _mm_loadl_epi64((__m128i *) &in[5*sstep_in]);            \
+        __m128i m6  = _mm_loadl_epi64((__m128i *) &in[6*sstep_in]);            \
+        __m128i m7  = _mm_loadl_epi64((__m128i *) &in[7*sstep_in]);            \
+                                                                               \
+        __m128i m10 = _mm_unpacklo_epi8(m0, m1);                               \
+        __m128i m11 = _mm_unpacklo_epi8(m2, m3);                               \
+        __m128i m12 = _mm_unpacklo_epi8(m4, m5);                               \
+        __m128i m13 = _mm_unpacklo_epi8(m6, m7);                               \
+                                                                               \
+        m0  = _mm_unpacklo_epi16(m10, m11);                                    \
+        m1  = _mm_unpacklo_epi16(m12, m13);                                    \
+        m2  = _mm_unpackhi_epi16(m10, m11);                                    \
+        m3  = _mm_unpackhi_epi16(m12, m13);                                    \
+                                                                               \
+        m10 = _mm_unpacklo_epi32(m0 , m1 );                                    \
+        m11 = _mm_unpacklo_epi32(m2 , m3 );                                    \
+        m12 = _mm_unpackhi_epi32(m0 , m1 );                                    \
+        m13 = _mm_unpackhi_epi32(m2 , m3 );                                    \
+        /* m10/m12/m11/m13 now hold output rows 0-1 / 2-3 / 4-5 / 6-7 */       \
+        STORE8(out, sstep_out);                                                \
+    }
+#define TRANSPOSE16x16_8(in, sstep_in, out, sstep_out) /* by 8x8 tiles */     \
+    for (y = 0; y < sstep_in; y+=8) /* x, y: caller's; bound is sstep_in */   \
+        for (x = 0; x < sstep_in; x+=8)                                       \
+            TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
+#define TRANSPOSE32x32_8(in, sstep_in, out, sstep_out) /* same as 16x16 */    \
+    for (y = 0; y < sstep_in; y+=8)                                           \
+        for (x = 0; x < sstep_in; x+=8)                                       \
+            TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define TRANSPOSE4x4_10(in, sstep_in, out, sstep_out)                          \
+    {                                                                          \
+        __m128i m0  = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]);            \
+        __m128i m1  = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]);            \
+        __m128i m2  = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]);            \
+        __m128i m3  = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]);            \
+                                                                               \
+        __m128i m10 = _mm_unpacklo_epi16(m0, m1);                              \
+        __m128i m11 = _mm_unpacklo_epi16(m2, m3);                              \
+                                                                               \
+        m0  = _mm_unpacklo_epi32(m10, m11);                                    \
+        m1  = _mm_unpackhi_epi32(m10, m11);                                    \
+                                                                               \
+        _mm_storel_epi64((__m128i *) (out+0*sstep_out) , m0);                  \
+        _mm_storel_epi64((__m128i *) (out+1*sstep_out) , _mm_unpackhi_epi64(m0, m0));\
+        _mm_storel_epi64((__m128i *) (out+2*sstep_out) , m1);                  \
+        _mm_storel_epi64((__m128i *) (out+3*sstep_out) , _mm_unpackhi_epi64(m1, m1));\
+    }
+#define TRANSPOSE8x8_10(in, sstep_in, out, sstep_out)                          \
+    {                                                                          \
+        __m128i tmp0, tmp1, tmp2, tmp3, src0, src1, src2, src3;                \
+        __m128i m0  = _mm_loadu_si128((__m128i *) &in[0*sstep_in]);            \
+        __m128i m1  = _mm_loadu_si128((__m128i *) &in[1*sstep_in]);            \
+        __m128i m2  = _mm_loadu_si128((__m128i *) &in[2*sstep_in]);            \
+        __m128i m3  = _mm_loadu_si128((__m128i *) &in[3*sstep_in]);            \
+        __m128i m4  = _mm_loadu_si128((__m128i *) &in[4*sstep_in]);            \
+        __m128i m5  = _mm_loadu_si128((__m128i *) &in[5*sstep_in]);            \
+        __m128i m6  = _mm_loadu_si128((__m128i *) &in[6*sstep_in]);            \
+        __m128i m7  = _mm_loadu_si128((__m128i *) &in[7*sstep_in]);            \
+                                                                               \
+        tmp0 = _mm_unpacklo_epi16(m0, m1);                                     \
+        tmp1 = _mm_unpacklo_epi16(m2, m3);                                     \
+        tmp2 = _mm_unpacklo_epi16(m4, m5);                                     \
+        tmp3 = _mm_unpacklo_epi16(m6, m7);                                     \
+        src0 = _mm_unpacklo_epi32(tmp0, tmp1);                                 \
+        src1 = _mm_unpacklo_epi32(tmp2, tmp3);                                 \
+        src2 = _mm_unpackhi_epi32(tmp0, tmp1);                                 \
+        src3 = _mm_unpackhi_epi32(tmp2, tmp3);                                 \
+        tmp0 = _mm_unpackhi_epi16(m0, m1);                                     \
+        tmp1 = _mm_unpackhi_epi16(m2, m3);                                     \
+        tmp2 = _mm_unpackhi_epi16(m4, m5);                                     \
+        tmp3 = _mm_unpackhi_epi16(m6, m7);                                     \
+        m0   = _mm_unpacklo_epi64(src0 , src1);                                \
+        m1   = _mm_unpackhi_epi64(src0 , src1);                                \
+        m2   = _mm_unpacklo_epi64(src2 , src3);                                \
+        m3   = _mm_unpackhi_epi64(src2 , src3);                                \
+        src0 = _mm_unpacklo_epi32(tmp0, tmp1);                                 \
+        src1 = _mm_unpacklo_epi32(tmp2, tmp3);                                 \
+        src2 = _mm_unpackhi_epi32(tmp0, tmp1);                                 \
+        src3 = _mm_unpackhi_epi32(tmp2, tmp3);                                 \
+        m4   = _mm_unpacklo_epi64(src0 , src1);                                \
+        m5   = _mm_unpackhi_epi64(src0 , src1);                                \
+        m6   = _mm_unpacklo_epi64(src2 , src3);                                \
+        m7   = _mm_unpackhi_epi64(src2 , src3);                                \
+        STORE16(out, sstep_out);                                               \
+    }
+/*
+ * Transpose a square block of 16-bit samples in 8x8 tiles: the tile read at
+ * (row y, column x) of `in` is written, transposed, at (row x, column y) of
+ * `out`.
+ *
+ * `sstep_in` is used both as the input stride and as the block dimension, so
+ * the input must be tightly packed (stride == block size). The expanding
+ * scope must provide `int x, y`. Macro arguments are parenthesized so that
+ * expression arguments expand safely.
+ */
+#define TRANSPOSE_TILES8_10(in, sstep_in, out, sstep_out)                      \
+    for (y = 0; y < (sstep_in); y += 8)                                        \
+        for (x = 0; x < (sstep_in); x += 8)                                    \
+            TRANSPOSE8x8_10((&(in)[y * (sstep_in) + x]), (sstep_in),           \
+                            (&(out)[x * (sstep_out) + y]), (sstep_out))
+/* 16x16 and 32x32 share the tile loop; the size comes from sstep_in. */
+#define TRANSPOSE16x16_10(in, sstep_in, out, sstep_out)                        \
+    TRANSPOSE_TILES8_10(in, sstep_in, out, sstep_out)
+#define TRANSPOSE32x32_10(in, sstep_in, out, sstep_out)                        \
+    TRANSPOSE_TILES8_10(in, sstep_in, out, sstep_out)
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * Interpolated prediction row for 8-bit samples, 8 pixels per iteration:
+ *   p_src[x] = ((32 - fact) * ref[x+idx+1] + fact * ref[x+idx+2] + 16) >> 5
+ *
+ * Each 16-bit lane of r3 holds (32 - fact) in its low byte and fact in its
+ * high byte, matching the (sample, next sample) byte pairs built by the
+ * 1-byte shift + unpack, so _mm_maddubs_epi16 yields the weighted sum.
+ * _mm_mulhrs_epi16 by 1024 computes (v * 1024 + 0x4000) >> 15, which is
+ * (v + 16) >> 5.
+ *
+ * Uses x, fact, idx, ref, p_src, r0, r1 and r3 from the expanding scope.
+ * Every iteration loads 16 bytes from ref even though only 9 are consumed.
+ */
+#define ANGULAR_COMPUTE_8(W)                                                   \
+    for (x = 0; x < W; x += 8) {                                               \
+        r3 = _mm_set1_epi16((fact << 8) + (32 - fact));                        \
+        r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1]));                       \
+        r0 = _mm_srli_si128(r1, 1);                                            \
+        r1 = _mm_unpacklo_epi8(r1, r0);                                        \
+        r1 = _mm_maddubs_epi16(r1, r3);                                        \
+        r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024));                                           \
+        r1 = _mm_packus_epi16(r1, r1);                                         \
+        _mm_storel_epi64((__m128i *) &p_src[x], r1);                           \
+    }
+
+
+/*
+ * 4-pixel variant of ANGULAR_COMPUTE_8: same weighted interpolation, but no
+ * loop (so no x is needed) and only the low 4 result bytes are written, as a
+ * single 32-bit store through p_src. Still loads 16 bytes from ref.
+ */
+#define ANGULAR_COMPUTE4_8()                                                   \
+    r3 = _mm_set1_epi16((fact << 8) + (32 - fact));                            \
+    r1 = _mm_loadu_si128((__m128i*)(&ref[idx+1]));                             \
+    r0 = _mm_srli_si128(r1, 1);                                                \
+    r1 = _mm_unpacklo_epi8(r1, r0);                                            \
+    r1 = _mm_maddubs_epi16(r1, r3);                                            \
+    r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024));                                           \
+    r1 = _mm_packus_epi16(r1, r1);                                             \
+    *((uint32_t *)p_src) = _mm_cvtsi128_si32(r1)
+/* Wider rows reuse the looped macro with the row width as trip count. */
+#define ANGULAR_COMPUTE8_8()     ANGULAR_COMPUTE_8( 8)
+#define ANGULAR_COMPUTE16_8()    ANGULAR_COMPUTE_8(16)
+#define ANGULAR_COMPUTE32_8()    ANGULAR_COMPUTE_8(32)
+
+/*
+ * fact == 0 path for 8-bit samples: no interpolation is needed, so the row
+ * is a straight copy of W bytes starting at ref[idx+1] into p_src.
+ * All variants use r1 as the temporary.
+ */
+/* 4 bytes: loads 8 bytes from ref but stores only the low 32 bits. */
+#define ANGULAR_COMPUTE_ELSE4_8()                                              \
+    r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]);                              \
+    *((uint32_t *)p_src) = _mm_cvtsi128_si32(r1)
+/* 8 bytes: one 64-bit load and store. */
+#define ANGULAR_COMPUTE_ELSE8_8()                                              \
+    r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]);                              \
+    _mm_storel_epi64((__m128i *) p_src, r1)
+/* 16 bytes: one unaligned 128-bit load and store. */
+#define ANGULAR_COMPUTE_ELSE16_8()                                             \
+    r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]);                              \
+    _mm_storeu_si128((__m128i *) p_src, r1)
+/* 32 bytes: two unaligned 128-bit copies (offsets +1 and +17 in ref). */
+#define ANGULAR_COMPUTE_ELSE32_8()                                             \
+    r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]);                              \
+    _mm_storeu_si128((__m128i *) p_src ,r1);                                   \
+    r1 = _mm_loadu_si128((__m128i*) &ref[idx+17]);                             \
+    _mm_storeu_si128((__m128i *)&p_src[16] ,r1)
+
+/*
+ * Edge-filter helper for 8-bit samples. Leaves in r0, as signed 16-bit lanes
+ * that are not yet clamped:
+ *   r0[i] = ((src1[i] - src1[-1]) >> 1) + src2[0]      for i = 0..7
+ * r3 keeps the 16 raw bytes loaded from src1 (consumed by CLIP_PIXEL_HI),
+ * r1 the broadcast src1[-1] and r2 the broadcast src2[0].
+ *
+ * The parameter names shadow the src1/src2 locals of the generated
+ * functions; the visible callers invoke this as CLIP_PIXEL(src2, src1).
+ */
+#define CLIP_PIXEL(src1, src2)                                                 \
+    r3  = _mm_loadu_si128((__m128i*)src1);                                     \
+    r1  = _mm_set1_epi16(src1[-1]);                                            \
+    r2  = _mm_set1_epi16(src2[0]);                                             \
+    r0  = _mm_unpacklo_epi8(r3,_mm_setzero_si128());                           \
+    r0  = _mm_subs_epi16(r0, r1);                                              \
+    r0  = _mm_srai_epi16(r0, 1);                                               \
+    r0  = _mm_add_epi16(r0, r2)
+/*
+ * Same computation for samples 8..15. Must follow CLIP_PIXEL, since it
+ * reads r1, r2 and r3 as left by it; the result replaces r3.
+ */
+#define CLIP_PIXEL_HI()                                                        \
+    r3  = _mm_unpackhi_epi8(r3,_mm_setzero_si128());                           \
+    r3  = _mm_subs_epi16(r3, r1);                                              \
+    r3  = _mm_srai_epi16(r3, 1);                                               \
+    r3  = _mm_add_epi16(r3, r2)
+
+/*
+ * Column edge filter, 8-bit, 4 rows: computes the filtered values with
+ * CLIP_PIXEL(src2, src1), saturates them to 0..255 with _mm_packus_epi16 and
+ * writes one byte per row at src[0], src[stride], src[2*stride] and
+ * src[3*stride]. p_src is left pointing at the last row written.
+ */
+#define CLIP_PIXEL1_4_8()                                                      \
+    p_src = src;                                                               \
+    CLIP_PIXEL(src2, src1);                                                    \
+    r0  = _mm_packus_epi16(r0, r0);                                            \
+    *((char *) p_src) = _mm_extract_epi8(r0, 0);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 1);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 2);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 3)
+/*
+ * 8-row variant: runs the 4-row macro, then continues from where it left
+ * p_src and r0 to write rows 4..7.
+ */
+#define CLIP_PIXEL1_8_8()                                                      \
+    CLIP_PIXEL1_4_8();                                                         \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 4);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 5);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 6);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 7)
+/*
+ * Column edge filter, 8-bit, 16 rows: CLIP_PIXEL produces samples 0..7 in
+ * r0 and CLIP_PIXEL_HI samples 8..15 in r3; _mm_packus_epi16 saturates both
+ * halves into the 16 bytes of r0, which are then written one byte per row
+ * starting at src and advancing by stride.
+ */
+#define CLIP_PIXEL1_16_8()                                                     \
+    p_src = src;                                                               \
+    CLIP_PIXEL(src2, src1);                                                    \
+    CLIP_PIXEL_HI();                                                           \
+    r0  = _mm_packus_epi16(r0, r3);                                            \
+    *((char *) p_src) = _mm_extract_epi8(r0, 0);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 1);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 2);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 3);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 4);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 5);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 6);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 7);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 8);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0, 9);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,10);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,11);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,12);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,13);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,14);                               \
+    p_src += stride;                                                           \
+    *((char *) p_src) = _mm_extract_epi8(r0,15)
+/* Expands to nothing, so 32x32 blocks get no column edge filtering. */
+#define CLIP_PIXEL1_32_8()
+
+/*
+ * Row edge filter, 8-bit: same CLIP_PIXEL(src2, src1) computation as the
+ * CLIP_PIXEL1 macros, but the saturated result is stored contiguously at
+ * the start of _src (the first row of the destination) instead of down a
+ * column.
+ */
+/* 4 samples: one 32-bit store. */
+#define CLIP_PIXEL2_4_8()                                                      \
+    CLIP_PIXEL(src2, src1);                                                    \
+    r0  = _mm_packus_epi16(r0, r0);                                            \
+    *((uint32_t *)_src) = _mm_cvtsi128_si32(r0)
+/* 8 samples: one 64-bit store. */
+#define CLIP_PIXEL2_8_8()                                                      \
+    CLIP_PIXEL(src2, src1);                                                    \
+    r0  = _mm_packus_epi16(r0, r0);                                            \
+    _mm_storel_epi64((__m128i*)_src, r0)
+/* 16 samples: low and high halves packed into one 128-bit store. */
+#define CLIP_PIXEL2_16_8()                                                     \
+    CLIP_PIXEL(src2, src1);                                                    \
+    CLIP_PIXEL_HI();                                                           \
+    r0  = _mm_packus_epi16(r0, r3);                                            \
+    _mm_storeu_si128((__m128i*) _src , r0)
+/* Expands to nothing, so 32x32 blocks get no row edge filtering. */
+#define CLIP_PIXEL2_32_8()
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#if HAVE_SSE4
+/*
+ * Interpolated prediction row for 16-bit sample storage, 4 pixels per
+ * iteration:
+ *   p_src[x] = ((32 - fact) * ref[x+idx+1] + fact * ref[x+idx+2] + 16) >> 5
+ *
+ * Each 32-bit lane of r3 holds (32 - fact) in its low 16 bits and fact in
+ * its high 16 bits, matching the (sample, next sample) pairs built by the
+ * 2-byte shift + unpack, so _mm_madd_epi16 yields the weighted sum as a
+ * 32-bit value per lane.
+ *
+ * NOTE(review): the rounding step uses the 16-bit _mm_mulhrs_epi16 on those
+ * 32-bit lanes. That is only correct while each sum fits in 15 bits, so that
+ * the upper half of the lane is zero and the lower half is non-negative.
+ * This holds for samples <= 1023 (32 * 1023 = 32736); confirm that no caller
+ * feeds deeper samples through this path.
+ *
+ * _MM_PACKUS_EPI32 is defined outside this excerpt. Uses x, fact, idx, ref,
+ * p_src, r0, r1 and r3 from the expanding scope. Every iteration loads
+ * 16 bytes from ref although only 5 samples are consumed.
+ */
+#define ANGULAR_COMPUTE_10(W)                                                  \
+    for (x = 0; x < W; x += 4) {                                               \
+        r3 = _mm_set1_epi32((fact << 16) + (32 - fact));                       \
+        r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1]));                       \
+        r0 = _mm_srli_si128(r1, 2);                                            \
+        r1 = _mm_unpacklo_epi16(r1, r0);                                       \
+        r1 = _mm_madd_epi16(r1, r3);                                           \
+        r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024));                                           \
+        r1 = _MM_PACKUS_EPI32(r1, r1);                                         \
+        _mm_storel_epi64((__m128i *) &p_src[x], r1);                           \
+    }
+/* Per-width entry points: the row width is the loop trip count. */
+#define ANGULAR_COMPUTE4_10()    ANGULAR_COMPUTE_10( 4)
+#define ANGULAR_COMPUTE8_10()    ANGULAR_COMPUTE_10( 8)
+#define ANGULAR_COMPUTE16_10()   ANGULAR_COMPUTE_10(16)
+#define ANGULAR_COMPUTE32_10()   ANGULAR_COMPUTE_10(32)
+
+/*
+ * fact == 0 path for 16-bit sample storage: straight copy of W samples from
+ * ref[idx+1] into p_src, 8 samples (one 128-bit register) per iteration.
+ * Uses x and r1 from the expanding scope.
+ */
+#define ANGULAR_COMPUTE_ELSE_10(W)                                             \
+    for (x = 0; x < W; x += 8) {                                               \
+        r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1]));                       \
+        _mm_storeu_si128((__m128i *) &p_src[x], r1);                           \
+    }
+
+/* 4 samples only fill 64 bits, so this uses a half-register copy. */
+#define ANGULAR_COMPUTE_ELSE4_10()                                             \
+    r1 = _mm_loadl_epi64((__m128i*)(&ref[idx+1]));                             \
+    _mm_storel_epi64((__m128i *) p_src, r1)
+
+#define ANGULAR_COMPUTE_ELSE8_10()      ANGULAR_COMPUTE_ELSE_10(8)
+#define ANGULAR_COMPUTE_ELSE16_10()     ANGULAR_COMPUTE_ELSE_10(16)
+#define ANGULAR_COMPUTE_ELSE32_10()     ANGULAR_COMPUTE_ELSE_10(32)
+
+/*
+ * Edge-filter helper for 16-bit sample storage. Leaves in r0, not yet
+ * clamped:
+ *   r0[i] = ((src2[i] - src2[-1]) >> 1) + src1[0]      for i = 0..7
+ * r1 keeps the broadcast src2[-1] and r2 the broadcast src1[0]. Unlike the
+ * 8-bit CLIP_PIXEL it takes no arguments and reads the src1/src2 locals of
+ * the expanding function directly.
+ */
+#define CLIP_PIXEL_10()                                                        \
+    r0  = _mm_loadu_si128((__m128i*)src2);                                     \
+    r1  = _mm_set1_epi16(src2[-1]);                                            \
+    r2  = _mm_set1_epi16(src1[0]);                                             \
+    r0  = _mm_subs_epi16(r0, r1);                                              \
+    r0  = _mm_srai_epi16(r0, 1);                                               \
+    r0  = _mm_add_epi16(r0, r2)
+/*
+ * Same computation for samples 8..15, left in r3. Must follow
+ * CLIP_PIXEL_10, since it reuses r1 and r2 as set up there.
+ */
+#define CLIP_PIXEL_HI_10()                                                     \
+    r3  = _mm_loadu_si128((__m128i*)&src2[8]);                                 \
+    r3  = _mm_subs_epi16(r3, r1);                                              \
+    r3  = _mm_srai_epi16(r3, 1);                                               \
+    r3  = _mm_add_epi16(r3, r2)
+
+/*
+ * Column edge filter, 16-bit storage, 4 rows: computes the filtered values
+ * with CLIP_PIXEL_10, clamps them to [0, 0x03ff] and writes one sample per
+ * row at src[0], src[stride], src[2*stride] and src[3*stride]. p_src is left
+ * pointing at the last row written.
+ */
+#define CLIP_PIXEL1_4_10()                                                     \
+    p_src = src;                                                               \
+    CLIP_PIXEL_10();                                                           \
+    r0  = _mm_max_epi16(r0, _mm_setzero_si128());                              \
+    r0  = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff));                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 0);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 1);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 2);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 3)
+/*
+ * 8-row variant: runs the 4-row macro, then continues from where it left
+ * p_src and r0 to write rows 4..7.
+ */
+#define CLIP_PIXEL1_8_10()                                                     \
+    CLIP_PIXEL1_4_10();                                                        \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 4);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 5);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 6);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 7)
+/*
+ * Column edge filter, 16-bit storage, 16 rows: samples 0..7 come from r0
+ * (CLIP_PIXEL_10) and samples 8..15 from r3 (CLIP_PIXEL_HI_10). Both are
+ * clamped to [0, 0x03ff] and written one sample per row starting at src and
+ * advancing by stride.
+ */
+#define CLIP_PIXEL1_16_10()                                                    \
+    p_src = src;                                                               \
+    CLIP_PIXEL_10();                                                           \
+    CLIP_PIXEL_HI_10();                                                        \
+    r0  = _mm_max_epi16(r0, _mm_setzero_si128());                              \
+    r0  = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff));                           \
+    r3  = _mm_max_epi16(r3, _mm_setzero_si128());                              \
+    r3  = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff));                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 0);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 1);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 2);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 3);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 4);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 5);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 6);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r0, 7);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 0);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 1);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 2);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 3);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 4);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 5);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 6);                          \
+    p_src += stride;                                                           \
+    *((uint16_t *) p_src) = _mm_extract_epi16(r3, 7)
+/* Expands to nothing, so 32x32 blocks get no column edge filtering. */
+#define CLIP_PIXEL1_32_10()
+
+/*
+ * Row edge filter, 16-bit storage: same CLIP_PIXEL_10 computation as the
+ * CLIP_PIXEL1 macros, clamped to [0, 0x03ff], but stored contiguously at
+ * the start of _src (the first row of the destination).
+ */
+/* 4 samples: the low 64 bits of r0. */
+#define CLIP_PIXEL2_4_10()                                                     \
+    CLIP_PIXEL_10();                                                           \
+    r0  = _mm_max_epi16(r0, _mm_setzero_si128());                              \
+    r0  = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff));                           \
+    _mm_storel_epi64((__m128i*) _src    , r0)
+/* 8 samples: the whole of r0. */
+#define CLIP_PIXEL2_8_10()                                                     \
+    CLIP_PIXEL_10();                                                           \
+    r0  = _mm_max_epi16(r0, _mm_setzero_si128());                              \
+    r0  = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff));                           \
+    _mm_storeu_si128((__m128i*) _src    , r0)
+/*
+ * Row edge filter, 16-bit storage, 16 samples: samples 0..7 come from r0
+ * (CLIP_PIXEL_10) and samples 8..15 from r3 (CLIP_PIXEL_HI_10). Both are
+ * clamped to [0, 0x03ff] and stored as two consecutive 128-bit writes.
+ *
+ * The stores go through p_out, the typed alias of _src, because the second
+ * store needs an offset of 8 samples rather than 8 bytes.
+ *
+ * Like its sibling macros this expands to statements without a trailing
+ * ';', so the invocation supplies the terminator.
+ */
+#define CLIP_PIXEL2_16_10()                                                    \
+    CLIP_PIXEL_10();                                                           \
+    CLIP_PIXEL_HI_10();                                                        \
+    r0  = _mm_max_epi16(r0, _mm_setzero_si128());                              \
+    r0  = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff));                           \
+    r3  = _mm_max_epi16(r3, _mm_setzero_si128());                              \
+    r3  = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff));                           \
+    _mm_storeu_si128((__m128i*) p_out    , r0);                                \
+    _mm_storeu_si128((__m128i*) &p_out[8], r3)
+
+/* Expands to nothing, so 32x32 blocks get no row edge filtering. */
+#define CLIP_PIXEL2_32_10()
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+/*
+ * Declares and initializes the working pointers of a generated 8-bit
+ * predictor. Expects _top, _left, _src, _stride, mode and a writable
+ * `stride` in the expanding scope.
+ *
+ *   mode >= 18: src1 = _top, src2 = _left; rows are written straight into
+ *               _src using the caller's stride.
+ *   mode <  18: src1 = _left, src2 = _top; rows are written into the packed
+ *               W*W scratch buffer src_tmp (stride W), which the generated
+ *               function transposes into _src afterwards.
+ *
+ * p_out always aliases _src. ref points one sample before src1, so
+ * ref[i + 1] == src1[i].
+ *
+ * NOTE(review): ref drops the const qualifier of src1, and the generated
+ * function writes through it at negative indices. Presumably the reference
+ * arrays are writable and padded in front - confirm against the callers.
+ */
+#define PRED_ANGULAR_INIT_8(W)                                                 \
+    const uint8_t *src1;                                                       \
+    const uint8_t *src2;                                                       \
+    uint8_t       *ref, *p_src, *src, *p_out;                                  \
+    uint8_t        src_tmp[W*W];                                               \
+    if (mode >= 18) {                                                          \
+        src1   = (const uint8_t*) _top;                                        \
+        src2   = (const uint8_t*) _left;                                       \
+        src    = (uint8_t*) _src;                                              \
+        stride = _stride;                                                      \
+        p_src  = src;                                                          \
+    } else {                                                                   \
+        src1   = (const uint8_t*) _left;                                       \
+        src2   = (const uint8_t*) _top;                                        \
+        src    = &src_tmp[0];                                                  \
+        stride = W;                                                            \
+        p_src  = src;                                                          \
+    }                                                                          \
+    p_out  = (uint8_t*) _src;                                                  \
+    ref = (uint8_t*) (src1 - 1)
+/*
+ * 16-bit-storage counterpart of PRED_ANGULAR_INIT_8: identical structure,
+ * with the byte pointers of the public signature reinterpreted as uint16_t
+ * samples. Since `stride = _stride` is later added to uint16_t pointers,
+ * _stride is consumed in units of samples, not bytes.
+ *
+ *   mode >= 18: src1 = _top, src2 = _left, output goes straight to _src.
+ *   mode <  18: src1 = _left, src2 = _top, output goes to the packed W*W
+ *               scratch buffer src_tmp (stride W) for later transposition.
+ *
+ * NOTE(review): as in the 8-bit variant, ref drops the const qualifier of
+ * src1 and is written through at negative indices by the generated
+ * function - confirm the reference arrays are writable and padded.
+ */
+#define PRED_ANGULAR_INIT_10(W)                                                \
+    const uint16_t *src1;                                                      \
+    const uint16_t *src2;                                                      \
+    uint16_t       *ref, *p_src, *src, *p_out;                                 \
+    uint16_t        src_tmp[W*W];                                              \
+    if (mode >= 18) {                                                          \
+        src1   = (const uint16_t*) _top;                                       \
+        src2   = (const uint16_t*) _left;                                      \
+        src    = (uint16_t*) _src;                                             \
+        stride = _stride;                                                      \
+        p_src  = src;                                                          \
+    } else {                                                                   \
+        src1   = (const uint16_t*) _left;                                      \
+        src2   = (const uint16_t*) _top;                                       \
+        src    = &src_tmp[0];                                                  \
+        stride = W;                                                            \
+        p_src  = src;                                                          \
+    }                                                                          \
+    p_out  = (uint16_t*) _src;                                                 \
+    ref = (uint16_t*) (src1 - 1)
+
+/*
+ * Local variable declarations expanded at the top of each generated
+ * predictor. ("WAR" is presumably a misspelling of "VAR"; the names are kept
+ * because PRED_ANGULAR builds them by token pasting.)
+ *
+ * Base set: the row counter y and the registers r0, r1, r3 used by the
+ * ANGULAR_COMPUTE macros.
+ */
+#define PRED_ANGULAR_WAR()                                                     \
+    int y;                                                                     \
+    __m128i r0, r1, r3
+
+/* 4x4, 8-bit: adds r2 for the CLIP_PIXEL macros; declares no x. */
+#define PRED_ANGULAR_WAR4_8()                                                  \
+    PRED_ANGULAR_WAR();                                                        \
+    __m128i r2
+/* 8x8 and 16x16, 8-bit: additionally the column counter x. */
+#define PRED_ANGULAR_WAR8_8()                                                  \
+    PRED_ANGULAR_WAR4_8();                                                       \
+    int x
+#define PRED_ANGULAR_WAR16_8()                                                 \
+    PRED_ANGULAR_WAR8_8()
+/* 32x32: no r2, since the 32-wide CLIP_PIXEL macros are empty. */
+#define PRED_ANGULAR_WAR32_8()                                                 \
+    PRED_ANGULAR_WAR();                                                        \
+    int x
+
+/*
+ * The 16-bit-storage variants reuse the 8-bit sets. The 4x4 one maps to the
+ * 8x8 set because ANGULAR_COMPUTE4_10 is loop based and therefore needs x.
+ */
+#define PRED_ANGULAR_WAR4_10()    PRED_ANGULAR_WAR8_8()
+#define PRED_ANGULAR_WAR8_10()    PRED_ANGULAR_WAR8_8()
+#define PRED_ANGULAR_WAR16_10()   PRED_ANGULAR_WAR16_8()
+#define PRED_ANGULAR_WAR32_10()   PRED_ANGULAR_WAR32_8()
+
+/*
+ * PRED_ANGULAR(W, D): generates pred_angular_<W>_<D>_sse(), the angular
+ * intra predictor for a WxW block using the D-suffixed helper macros.
+ *
+ *   _src     destination block (reinterpreted by PRED_ANGULAR_INIT_<D>)
+ *   _top     reference samples above the block
+ *   _left    reference samples to the left of the block
+ *   _stride  destination stride; it is added directly to typed sample
+ *            pointers, so it is counted in samples of the working type
+ *   c_idx    component index; the edge filters only run when it is 0
+ *   mode     prediction mode; it indexes intra_pred_angle[mode - 2], so it
+ *            must lie in 2..34
+ *
+ * Modes >= 18 predict from _top straight into _src. Modes < 18 predict from
+ * _left into the packed scratch buffer src_tmp, which is then transposed
+ * into _src. Mode 26 additionally runs CLIP_PIXEL1 and mode 10 runs
+ * CLIP_PIXEL2 when c_idx == 0.
+ *
+ * inv_angle is only read when angle < 0, which by the table above means
+ * modes 11..25, so mode - 11 stays within its 15 entries.
+ *
+ * NOTE(review): for negative angles with last < -1 the reference is extended
+ * by writing ref[y] for y in [last, -1], i.e. into the memory in front of
+ * the main reference array, through a pointer whose const was cast away.
+ * Presumably the callers provide writable padding there - confirm.
+ *
+ * Both lookup tables are static const so that they live in read-only data
+ * instead of being rebuilt on the stack by every call.
+ */
+#define PRED_ANGULAR(W, D)                                                     \
+static av_always_inline void pred_angular_ ## W ##_ ## D ## _sse(uint8_t *_src,\
+        const uint8_t *_top, const uint8_t *_left, ptrdiff_t _stride, int c_idx, int mode) {\
+    static const int intra_pred_angle[] = {                                    \
+         32, 26, 21, 17, 13,  9,  5,  2,  0, -2, -5, -9,-13,-17,-21,-26,       \
+        -32,-26,-21,-17,-13, -9, -5, -2,  0,  2,  5,  9, 13, 17, 21, 26, 32    \
+    };                                                                         \
+    static const int inv_angle[] = {                                           \
+        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,    \
+        -630, -910, -1638, -4096                                               \
+    };                                                                         \
+    PRED_ANGULAR_WAR ## W ## _ ## D();                                         \
+    int            angle   = intra_pred_angle[mode-2];                         \
+    int            angle_i = angle;                                            \
+    int            last    = (W * angle) >> 5;                                 \
+    int            stride;                                                     \
+    PRED_ANGULAR_INIT_ ## D(W);                                                \
+    if (angle < 0 && last < -1) {                                              \
+        for (y = last; y <= -1; y++)                                           \
+            ref[y] = src2[-1 + ((y * inv_angle[mode-11] + 128) >> 8)];         \
+    }                                                                          \
+    for (y = 0; y < W; y++) {                                                  \
+        int idx  = (angle_i) >> 5;                                             \
+        int fact = (angle_i) & 31;                                             \
+        if (fact) {                                                            \
+            ANGULAR_COMPUTE ## W ## _ ## D();                                  \
+        } else {                                                               \
+            ANGULAR_COMPUTE_ELSE ## W ## _ ## D();                             \
+        }                                                                      \
+        angle_i += angle;                                                      \
+        p_src   += stride;                                                     \
+    }                                                                          \
+    if (mode >= 18) {                                                          \
+        if (mode == 26 && c_idx == 0) {                                        \
+            CLIP_PIXEL1_ ## W ## _ ## D();                                     \
+        }                                                                      \
+    } else {                                                                   \
+        TRANSPOSE ## W ## x ## W ## _ ## D(src_tmp, W, p_out, _stride);        \
+        if (mode == 10 && c_idx == 0) {                                        \
+            CLIP_PIXEL2_ ## W ## _ ## D();                                     \
+        }                                                                      \
+    }                                                                          \
+}
+
+/* Instantiate the static kernels: widths 4, 8, 16, 32 with the _8 helpers. */
+PRED_ANGULAR( 4, 8)
+PRED_ANGULAR( 8, 8)
+PRED_ANGULAR(16, 8)
+PRED_ANGULAR(32, 8)
+
+/* Same widths with the _10 (16-bit sample storage) helpers. */
+PRED_ANGULAR( 4,10)
+PRED_ANGULAR( 8,10)
+PRED_ANGULAR(16,10)
+PRED_ANGULAR(32,10)
+
+/*
+ * Public entry points. The numeric index in the exported name selects the
+ * block width (0, 1, 2, 3 -> 4, 8, 16, 32); each function simply forwards
+ * all of its arguments to the matching always-inline kernel generated by
+ * PRED_ANGULAR.
+ */
+#define PRED_ANGULAR_ENTRY(IDX, W, D)                                          \
+void pred_angular_ ## IDX ## _ ## D ## _sse(uint8_t *_src,                     \
+        const uint8_t *_top, const uint8_t *_left,                             \
+        ptrdiff_t _stride, int c_idx, int mode)                                \
+{                                                                              \
+    pred_angular_ ## W ## _ ## D ## _sse(_src, _top, _left, _stride,           \
+                                         c_idx, mode);                         \
+}
+
+PRED_ANGULAR_ENTRY(0,  4, 8)
+PRED_ANGULAR_ENTRY(1,  8, 8)
+PRED_ANGULAR_ENTRY(2, 16, 8)
+PRED_ANGULAR_ENTRY(3, 32, 8)
+
+PRED_ANGULAR_ENTRY(0,  4, 10)
+PRED_ANGULAR_ENTRY(1,  8, 10)
+PRED_ANGULAR_ENTRY(2, 16, 10)
+PRED_ANGULAR_ENTRY(3, 32, 10)
+
+#undef PRED_ANGULAR_ENTRY
+#endif
+
+#ifdef __GNUC__
+#pragma GCC pop_options
+#endif
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@ -263,4 +263,24 @@ void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t
 void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

+/*
+ * 4x4 luma transforms, one per bit depth (8, 10, 12). Presumably they
+ * operate in place on coeffs, since that is the only buffer passed -
+ * confirm against the implementation.
+ */
+void ff_hevc_transform_4x4_luma_8_sse2(int16_t *coeffs);
+void ff_hevc_transform_4x4_luma_10_sse2(int16_t *coeffs);
+void ff_hevc_transform_4x4_luma_12_sse2(int16_t *coeffs);
+
+/*
+ * Declares ff_hevc_transform_<s>x<s>_<b>_sse2(int16_t *coeffs, int col_limit)
+ * for block size s and bit depth b. The macro body already ends in ';', so
+ * the invocations below take none. Note that IDCT_FUNC stays defined for
+ * every file that includes this header.
+ */
+#define IDCT_FUNC(s, b) void ff_hevc_transform_ ## s ## x ## s ##_## b ##_sse2\
+            (int16_t *coeffs, int col_limit);
+
+IDCT_FUNC(4, 8)
+IDCT_FUNC(4, 10)
+IDCT_FUNC(4, 12)
+IDCT_FUNC(8, 8)
+IDCT_FUNC(8, 10)
+IDCT_FUNC(8, 12)
+IDCT_FUNC(16, 8)
+IDCT_FUNC(16, 10)
+IDCT_FUNC(16, 12)
+IDCT_FUNC(32, 8)
+IDCT_FUNC(32, 10)
+IDCT_FUNC(32, 12)
+
 #endif // AVCODEC_X86_HEVCDSP_H
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@ -835,6 +835,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+
+            /* intrinsics */
+            c->transform_4x4_luma = ff_hevc_transform_4x4_luma_8_sse2;
+            if (!ARCH_X86_64) {
+                c->idct[2] = ff_hevc_transform_16x16_8_sse2;
+                c->idct[3] = ff_hevc_transform_32x32_8_sse2;
+            }
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            if(ARCH_X86_64) {
@ -1010,6 +1017,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
+
+            /* intrinsics  */
+            c->transform_4x4_luma = ff_hevc_transform_4x4_luma_10_sse2;
+            if (!ARCH_X86_64) {
+                c->idct[2] = ff_hevc_transform_16x16_10_sse2;
+                c->idct[3] = ff_hevc_transform_32x32_10_sse2;
+            }
        }
        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@ -1215,6 +1229,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
+
+            /* intrinsics */
+            c->transform_4x4_luma = ff_hevc_transform_4x4_luma_12_sse2;
+            c->idct[0] = ff_hevc_transform_4x4_12_sse2;
+            c->idct[1] = ff_hevc_transform_8x8_12_sse2;
+            c->idct[2] = ff_hevc_transform_16x16_12_sse2;
+            c->idct[3] = ff_hevc_transform_32x32_12_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
@ -1252,3 +1273,37 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
        }
    }
 }
+
+#include "libavcodec/hevcpred.h"
+#include "libavcodec/x86/hevcpred.h"
+
+#undef FUNC /* drop any earlier FUNC definition before redefining it below */
+#define FUNC(a, depth) a ## _ ## depth ## _sse /* FUNC(pred_planar_0, 8) -> pred_planar_0_8_sse */
+
+#define HEVC_PRED(depth)                      \
+    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);  \
+    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);  \
+    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);  \
+    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);  \
+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth) /* needs 'hpc' in scope; the user supplies the final ';' */
+
+void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#ifndef _MSC_VER /* nothing is installed when building with MSVC */
+    if (!EXTERNAL_SSE4(cpu_flags))
+        return; /* no SSE4: leave the pointers already in hpc untouched */
+    switch (bit_depth) { /* only 8- and 10-bit variants exist; other depths fall through */
+    case 8:
+        HEVC_PRED(8);
+        break;
+    case 10:
+        HEVC_PRED(10);
+        break;
+    }
+#endif
+}
--- a/libavcodec/x86/hevcpred.h
+++ b/libavcodec/x86/hevcpred.h
@ -0,0 +1,34 @
+/*
+ * Prototypes of the SSE HEVC intra prediction functions that
+ * ff_hevc_pred_init_x86() stores in HEVCPredContext. In each name the digit
+ * after "planar"/"angular" is the pred_planar[]/pred_angular[] slot the
+ * function is assigned to, and the number before "_sse" is the bit depth.
+ */
+
+#ifndef AVCODEC_X86_HEVCPRED_H
+#define AVCODEC_X86_HEVCPRED_H
+
+#include <stddef.h> /* ptrdiff_t */
+#include <stdint.h> /* uint8_t */
+
+void pred_planar_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+
+void pred_angular_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+
+void pred_planar_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+
+void pred_angular_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+
+#endif // AVCODEC_X86_HEVCPRED_H
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@ -352,8 +352,8 @@ OBJS-$(CONFIG_MATROSKA_MUXER)            += matroskaenc.o matroska.o \
 OBJS-$(CONFIG_MCA_DEMUXER)               += mca.o
 OBJS-$(CONFIG_MATROSKA_HAALI_DEMUXER)    += matroskadec_haali.o matroska.o \
                                            MatroskaParser.o \
-                                            isom.o rmsipr.o flac_picture.o \
-                                            oggparsevorbis.o vorbiscomment.o
+                                            isom_tags.o rmsipr.o flac_picture.o \
+                                            oggparsevorbis.o vorbiscomment.o dovi_isom.o
 OBJS-$(CONFIG_MCC_DEMUXER)               += mccdec.o subtitles.o
 OBJS-$(CONFIG_MD5_MUXER)                 += hashenc.o
 OBJS-$(CONFIG_MGSTS_DEMUXER)             += mgsts.o
@ -734,6 +734,7 @@ SHLIBOBJS-$(CONFIG_HLS_DEMUXER)          += ac3_channel_layout_tab.o
 SHLIBOBJS-$(CONFIG_IMAGE_JPEGXL_PIPE_DEMUXER)    += jpegxl_parse.o
 SHLIBOBJS-$(CONFIG_JPEGXL_ANIM_DEMUXER)  += jpegxl_parse.o
 SHLIBOBJS-$(CONFIG_MATROSKA_DEMUXER)     += mpeg4audio_sample_rates.o
+SHLIBOBJS-$(CONFIG_MATROSKA_HAALI_DEMUXER) += mpeg4audio_sample_rates.o
 SHLIBOBJS-$(CONFIG_MOV_DEMUXER)          += ac3_channel_layout_tab.o
 SHLIBOBJS-$(CONFIG_MP3_MUXER)            += mpegaudiotabs.o
 SHLIBOBJS-$(CONFIG_MXF_MUXER)            += golomb_tab.o \
--- a/libavformat/MatroskaParser.c
+++ b/libavformat/MatroskaParser.c
--- a/libavformat/MatroskaParser.h
+++ b/libavformat/MatroskaParser.h
@ -113,6 +113,13 @@ typedef struct MatroskaFile MatroskaFile;
 #define	TT_AUDIO    2
 #define	TT_SUB	    17

+struct BlockAdditionMapping { /* element type of TrackInfo.BlockAdditionMappings */
+  unsigned int ID;
+  unsigned int Type;
+  unsigned int Length; /* presumably the size of Data in bytes -- TODO confirm against the parser */
+  void *Data; /* NOTE(review): ownership/lifetime is not visible in this header */
+};
+
 struct TrackInfo {
  unsigned char	  Number;
  unsigned char	  Type;
@ -121,6 +128,8 @@ struct TrackInfo {
  ulonglong	  MinCache;
  ulonglong	  MaxCache;
  ulonglong	  DefaultDuration;
+  ulonglong	  CodecDelay;
+  ulonglong	  SeekPreRoll;
  MKFLOAT	  TimecodeScale;
  void		  *CodecPrivate;
  unsigned	  CodecPrivateSize;
@ -132,6 +141,11 @@ struct TrackInfo {
  unsigned int  Enabled:1;
  unsigned int  Default:1;
  unsigned int  Forced:1;
+  unsigned int  HearingImpaired:1;
+  unsigned int  VisualImpaired:1;
+  unsigned int  TextDescriptions:1;
+  unsigned int  OriginalLanguage:1;
+  unsigned int  Commentary:1;
  unsigned int  Lacing:1;
  unsigned int  DecodeAll:1;
  unsigned int  CompEnabled:1;
@ -148,6 +162,42 @@ struct TrackInfo {
      unsigned int    CropL, CropT, CropR, CropB;
      unsigned int    ColourSpace;
      MKFLOAT	      GammaValue;
+      struct {
+          unsigned int MatrixCoefficients;
+          unsigned int BitsPerChannel;
+          unsigned int ChromaSubsamplingHorz;
+          unsigned int ChromaSubsamplingVert;
+          unsigned int CbSubsamplingHorz;
+          unsigned int CbSubsamplingVert;
+          unsigned int ChromaSitingHorz;
+          unsigned int ChromaSitingVert;
+          unsigned int Range;
+          unsigned int TransferCharacteristics;
+          unsigned int Primaries;
+          unsigned int MaxCLL;
+          unsigned int MaxFALL;
+          struct {
+              float PrimaryRChromaticityX;
+              float PrimaryRChromaticityY;
+              float PrimaryGChromaticityX;
+              float PrimaryGChromaticityY;
+              float PrimaryBChromaticityX;
+              float PrimaryBChromaticityY;
+              float WhitePointChromaticityX;
+              float WhitePointChromaticityY;
+              float LuminanceMax;
+              float LuminanceMin;
+          } MasteringMetadata;
+      } Colour;
+
+      struct {
+        unsigned int ProjectionType;
+        char ProjectionPrivate[20];
+        unsigned ProjectionPrivateSize;
+        MKFLOAT ProjectionPoseYaw;
+        MKFLOAT ProjectionPosePitch;
+        MKFLOAT ProjectionPoseRoll;
+      } Projection;

      unsigned int  Interlaced:1;
    } Video;
@ -163,6 +213,12 @@ struct TrackInfo {
  char			*Name;
  char			Language[4];
  char			*CodecID;
+
+  unsigned int NeedKeyframes;
+
+  // BlockAdditionMappings
+  unsigned int nBlockAdditionMappings,nBlockAdditionMappingsSize;
+  struct BlockAdditionMapping *BlockAdditionMappings;
 };

 typedef struct TrackInfo  TrackInfo;
@ -244,7 +300,9 @@ typedef struct Chapter	Chapter;

 struct Cue {
  ulonglong        Time;
+  ulonglong        Duration;
  ulonglong        Position;
+  ulonglong        RelativePosition;
  ulonglong        Block;
  unsigned char        Track;
 };
@ -292,6 +350,11 @@ X MatroskaFile  *mkv_OpenEx(/* in */  InputStream *io,
 			  /* out */ char *err_msg,
 			  /* in */  unsigned msgsize);

+/* Open the file and only parse enough information to find the segment uid */
+X MatroskaFile  *mkv_OpenSparse(/* in */ InputStream *io,
+        /* out */ char *err_msg,
+        /* in */  unsigned msgsize);
+
 /* Close and deallocate mf
 * NULL pointer is ok and is simply ignored
 */
@ -334,6 +397,8 @@ X void	      mkv_Seek(/* in */ MatroskaFile *mf,
 		       /* in */	ulonglong timecode /* in ns */,
 		       /* in */ unsigned flags);

+X void mkv_Seek_CueAware(MatroskaFile *mf, ulonglong timecode, unsigned flags, unsigned fuzzy);
+
 X void	      mkv_SkipToKeyframe(MatroskaFile *mf);

 X ulonglong   mkv_GetLowestQTimecode(MatroskaFile *mf);
@ -372,7 +437,11 @@ X int	      mkv_ReadFrame(/* in */  MatroskaFile *mf,
 			    /* out */ ulonglong *FilePos /* in bytes from start of file */,
 			    /* out */ unsigned int *FrameSize /* in bytes */,
 			    /* out */ char **FrameData,
-			    /* out */ unsigned int *FrameFlags);
+			    /* out */ unsigned int *FrameFlags,
+			    /* out */ longlong *FrameDiscard,
+			    /* out */ unsigned int *FrameAdditionalSize, /* in bytes */
+			    /* out */ char **FrameAdditionalData,
+			    /* out */ unsigned int *FrameAdditionalID);

 #ifdef MATROSKA_COMPRESSION_SUPPORT
 /* Compressed streams support */
--- a/libavformat/asfdec_f.c
+++ b/libavformat/asfdec_f.c
@ -1536,7 +1536,7 @@ static int asf_build_simple_index(AVFormatContext *s, int stream_index)
            int pktnum        = avio_rl32(s->pb);
            int pktct         = avio_rl16(s->pb);
            int64_t pos       = ffformatcontext(s)->data_offset + s->packet_size * (int64_t)pktnum;
-            int64_t index_pts = FFMAX(av_rescale(itime, i, 10000) - asf->hdr.preroll, 0);
+            int64_t index_pts = FFMAX(av_rescale(itime, i, 10000), 0);

            if (avio_feof(s->pb)) {
                ret = AVERROR_INVALIDDATA;
--- a/libavformat/avformat.h
+++ b/libavformat/avformat.h
@ -1430,6 +1430,7 @@ typedef struct AVFormatContext {
 #endif
 #define AVFMT_FLAG_AUTO_BSF   0x200000 ///< Add bitstream filters as requested by the muxer

+#define AVFMT_FLAG_NOEXTERNAL     0x40000000 ///< Do not open external files referenced by the format
 #define AVFMT_FLAG_NETWORK        0x80000000 ///< Source is a network protocol, optimize for that

    /**
@ -3066,4 +3067,12 @@ AVRational av_stream_get_codec_timebase(const AVStream *st);
 * @}
 */

+enum AVStreamParseType av_lav_stream_parser_get_needed(const AVStream *st);
+void av_lav_stream_parser_set_needed(AVStream *st, enum AVStreamParseType needed);
+void av_lav_stream_parser_init(AVStream *st);
+int av_lav_stream_parser_get_flags(const AVStream *st);
+void av_lav_stream_parser_update_flags(AVStream *st, int flags);
+int av_lav_stream_codec_info_nb_frames(const AVStream *st);
+int av_lav_stream_get_timing_info(const AVStream *st, AVRational *tb, int *ticks_per_frame);
+
 #endif /* AVFORMAT_AVFORMAT_H */
--- a/libavformat/avidec.c
+++ b/libavformat/avidec.c
@ -710,7 +710,8 @@ static int avi_read_header(AVFormatContext *s)
                codec_type = AVMEDIA_TYPE_AUDIO;
                break;
            case MKTAG('t', 'x', 't', 's'):
-                codec_type = AVMEDIA_TYPE_SUBTITLE;
+                st->codecpar->codec_type = codec_type = AVMEDIA_TYPE_SUBTITLE;
+                ffstream(st)->request_probe = 1;
                break;
            case MKTAG('d', 'a', 't', 's'):
                codec_type = AVMEDIA_TYPE_DATA;
--- a/libavformat/avio.c
+++ b/libavformat/avio.c
@ -32,7 +32,7 @@
 #endif
 #include "url.h"

-#define IO_BUFFER_SIZE 32768
+#define IO_BUFFER_SIZE 131072

 /** @name Logging context. */
 /*@{*/
--- a/libavformat/avio.h
+++ b/libavformat/avio.h
@ -643,6 +643,8 @@ int avio_get_str16be(AVIOContext *pb, int maxlen, char *buf, int buflen);
 */
 #define AVIO_FLAG_DIRECT 0x8000

+#define AVIO_FLAG_AVOID_FSTAT 0x80000000
+
 /**
 * Create and initialize a AVIOContext for accessing the
 * resource indicated by url.
--- a/libavformat/aviobuf.c
+++ b/libavformat/aviobuf.c
@ -32,7 +32,7 @@
 #include "internal.h"
 #include <stdarg.h>

-#define IO_BUFFER_SIZE 32768
+#define IO_BUFFER_SIZE 131072

 /**
 * Do seeks within this distance ahead of the current buffer by skipping
--- a/libavformat/demux.c
+++ b/libavformat/demux.c
@ -470,7 +470,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in

    if (ref == AV_NOPTS_VALUE)
        ref = pkt->pts;
-    if (sti->pts_wrap_reference != AV_NOPTS_VALUE || st->pts_wrap_bits >= 63 || ref == AV_NOPTS_VALUE || !s->correct_ts_overflow)
+    if (sti->pts_wrap_reference != AV_NOPTS_VALUE || st->pts_wrap_bits >= 63 || ref == AV_NOPTS_VALUE || !s->correct_ts_overflow || st->codecpar->codec_id == AV_CODEC_ID_DVB_TELETEXT)
        return 0;
    ref &= (1LL << st->pts_wrap_bits)-1;

@ -1820,7 +1820,7 @@ static void estimate_timings_from_bit_rate(AVFormatContext *ic)
 }

 #define DURATION_MAX_READ_SIZE 250000LL
-#define DURATION_MAX_RETRY 6
+#define DURATION_MAX_RETRY 7

 /* only usable for MPEG-PS streams */
 static void estimate_timings_from_pts(AVFormatContext *ic, int64_t old_offset)
@ -2994,10 +2994,12 @@ int avformat_find_stream_info(AVFormatContext *ic, AVDictionary **options)
            if (!st->r_frame_rate.num) {
                const AVCodecDescriptor *desc = sti->codec_desc;
                AVRational mul = (AVRational){ desc && (desc->props & AV_CODEC_PROP_FIELDS) ? 2 : 1, 1 };
-                AVRational  fr = av_mul_q(avctx->framerate, mul);
+                AVRational time_base = av_inv_q(av_mul_q(avctx->framerate, mul));

-                if (fr.num && fr.den && av_cmp_q(st->time_base, av_inv_q(fr)) <= 0) {
-                    st->r_frame_rate = fr;
+                if (   time_base.den * (int64_t) st->time_base.num
+                    <= time_base.num * (uint64_t)mul.num * st->time_base.den) {
+                    av_reduce(&st->r_frame_rate.num, &st->r_frame_rate.den,
+                              time_base.den, (int64_t)time_base.num * mul.num, INT_MAX);
                } else {
                    st->r_frame_rate.num = st->time_base.den;
                    st->r_frame_rate.den = st->time_base.num;
--- a/libavformat/file.c
+++ b/libavformat/file.c
@ -323,7 +323,7 @@ static int file_open(URLContext *h, const char *filename, int flags)
        if (c->trunc)
            access |= O_TRUNC;
    } else {
-        access = O_RDONLY;
+        access = O_RDONLY | O_SEQUENTIAL;
    }
 #ifdef O_BINARY
    access |= O_BINARY;
@ -333,7 +333,7 @@ static int file_open(URLContext *h, const char *filename, int flags)
        return AVERROR(errno);
    c->fd = fd;

-    h->is_streamed = !fstat(fd, &st) && S_ISFIFO(st.st_mode);
+    h->is_streamed = !(flags & AVIO_FLAG_AVOID_FSTAT) && !fstat(fd, &st) && S_ISFIFO(st.st_mode);

    /* Buffer writes more than the default 32k to improve throughput especially
     * with networked file systems */
--- a/libavformat/flacdec.c
+++ b/libavformat/flacdec.c
@ -56,7 +56,7 @@ static int flac_read_header(AVFormatContext *s)
 {
    int ret, metadata_last=0, metadata_type, metadata_size, found_streaminfo=0;
    uint8_t header[4];
-    uint8_t *buffer=NULL;
+    uint8_t *buffer=NULL, *tmp=NULL;
    uint32_t marker;
    FLACDecContext *flac = s->priv_data;
    AVStream *st = avformat_new_stream(s, NULL);
@ -117,14 +117,20 @@ static int flac_read_header(AVFormatContext *s)
                RETURN_ERROR(AVERROR_INVALIDDATA);
            }
            found_streaminfo = 1;
-            st->codecpar->extradata      = buffer;
-            st->codecpar->extradata_size = metadata_size;
-            buffer = NULL;
+            st->codecpar->extradata      = av_malloc(metadata_size + 8 + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!st->codecpar->extradata) {
+              RETURN_ERROR(AVERROR(ENOMEM));
+            }
+            st->codecpar->extradata_size = metadata_size + 8;
+            AV_WL32(st->codecpar->extradata, MKTAG('f','L','a','C'));
+            memcpy(st->codecpar->extradata + 4, header, 4);
+            memcpy(st->codecpar->extradata + 8, buffer, metadata_size);
+            av_freep(&buffer);

            /* get sample rate and sample count from STREAMINFO header;
             * other parameters will be extracted by the parser */
-            samplerate = AV_RB24(st->codecpar->extradata + 10) >> 4;
-            samples    = (AV_RB64(st->codecpar->extradata + 13) >> 24) & ((1ULL << 36) - 1);
+            samplerate = AV_RB24(st->codecpar->extradata + 8 + 10) >> 4;
+            samples    = (AV_RB64(st->codecpar->extradata + 8 + 13) >> 24) & ((1ULL << 36) - 1);

            /* set time base and duration */
            if (samplerate > 0) {
@ -188,6 +194,16 @@ static int flac_read_header(AVFormatContext *s)
            /* process supported blocks other than STREAMINFO */
            if (metadata_type == FLAC_METADATA_TYPE_VORBIS_COMMENT) {
                AVDictionaryEntry *chmask;
+                /* append VorbisComment to extradata */
+                tmp = av_realloc(st->codecpar->extradata, st->codecpar->extradata_size + 4 + metadata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!tmp) {
+                  RETURN_ERROR(AVERROR(ENOMEM));
+                }
+                st->codecpar->extradata = tmp;
+                tmp += st->codecpar->extradata_size;
+                memcpy(tmp, header, 4);
+                memcpy(tmp + 4, buffer, metadata_size);
+                st->codecpar->extradata_size = st->codecpar->extradata_size + 4 + metadata_size;

                ret = ff_vorbis_comment(s, &s->metadata, buffer, metadata_size, 1);
                if (ret < 0) {
--- a/libavformat/flic.c
+++ b/libavformat/flic.c
@ -269,6 +269,13 @@ static int flic_read_seek(AVFormatContext *s, int stream_index,
    int64_t pos, ts;
    int index;

+    if (pts == 0) {
+        flic->frame_number = 0;
+        avio_seek(s->pb, s->streams[flic->video_stream_index]->codecpar->extradata_size, SEEK_SET);
+
+        return 0;
+    }
+
    if (!sti->index_entries || stream_index != flic->video_stream_index)
        return -1;

--- a/libavformat/flvdec.c
+++ b/libavformat/flvdec.c
@ -1285,6 +1285,8 @@ retry:
                    av_log(s, AV_LOG_WARNING, "Adjusting next position due to index mismatch\n");
                    next = flv->validate_index[0].pos - 4;
                }
+                if (type < 0)
+                    flv->broken_sizes = 1;
                goto skip;
            } else if (type == TYPE_ONTEXTDATA) {
                avpriv_request_sample(s, "OnTextData packet");
--- a/libavformat/ftp.c
+++ b/libavformat/ftp.c
@ -30,6 +30,7 @@
 #include "urldecode.h"
 #include "libavutil/opt.h"
 #include "libavutil/bprint.h"
+#include "version.h"

 #define CONTROL_BUFFER_SIZE 1024
 #define DIR_BUFFER_SIZE 4096
@ -544,6 +545,7 @@ static int ftp_features(FTPContext *s)
 {
    static const char *feat_command        = "FEAT\r\n";
    static const char *enable_utf8_command = "OPTS UTF8 ON\r\n";
+    static const char *clnt_command = "CLNT " LIBAVFORMAT_IDENT "\r\n";
    static const int feat_codes[] = {211, 0};
    static const int opts_codes[] = {200, 202, 451, 0};

@ -553,7 +555,13 @@ static int ftp_features(FTPContext *s)
    }

    if (ftp_has_feature(s, "UTF8")) {
-        int ret = ftp_send_command(s, enable_utf8_command, opts_codes, NULL);
+        int ret;
+
+        if (ftp_has_feature(s, "CLNT")) {
+            ftp_send_command(s, clnt_command, NULL, NULL);
+        }
+
+        ret = ftp_send_command(s, enable_utf8_command, opts_codes, NULL);
        if (ret == 200 || ret == 202)
            s->utf8 = 1;
    }
--- a/libavformat/hls.c
+++ b/libavformat/hls.c
@ -2454,6 +2454,10 @@ static int hls_read_seek(AVFormatContext *s, int stream_index,
    duration = s->duration == AV_NOPTS_VALUE ?
               0 : s->duration;

+    /* bound seeking to the beginning of the stream */
+    if (seek_timestamp < first_timestamp)
+        seek_timestamp = first_timestamp;
+
    if (0 < duration && duration < seek_timestamp - first_timestamp)
        return AVERROR(EIO);

--- a/libavformat/httpauth.c
+++ b/libavformat/httpauth.c
@ -226,9 +226,9 @@ static char *make_digest_auth(HTTPAuthState *state, const char *username,
    if (digest->opaque[0])
        av_strlcatf(authstr, len, ", opaque=\"%s\"", digest->opaque);
    if (digest->qop[0]) {
-        av_strlcatf(authstr, len, ", qop=\"%s\"",    digest->qop);
-        av_strlcatf(authstr, len, ", cnonce=\"%s\"", cnonce);
+        av_strlcatf(authstr, len, ", qop=%s",        digest->qop);
        av_strlcatf(authstr, len, ", nc=%s",         nc);
+        av_strlcatf(authstr, len, ", cnonce=\"%s\"", cnonce);
    }

    av_strlcatf(authstr, len, "\r\n");
--- a/libavformat/img2dec.c
+++ b/libavformat/img2dec.c
@ -492,7 +492,7 @@ int ff_img_read_packet(AVFormatContext *s1, AVPacket *pkt)
        } else if (!ffstream(s1->streams[0])->parser) {
            size[0] = avio_size(s1->pb);
        } else {
-            size[0] = 4096;
+            size[0]= avio_size(f[0]);
        }
    }

@ -550,15 +550,7 @@ int ff_img_read_packet(AVFormatContext *s1, AVPacket *pkt)
    }

    if (ret[0] <= 0 || ret[1] < 0 || ret[2] < 0) {
-        if (ret[0] < 0) {
-            res = ret[0];
-        } else if (ret[1] < 0) {
-            res = ret[1];
-        } else if (ret[2] < 0) {
-            res = ret[2];
-        } else {
-            res = AVERROR_EOF;
-        }
+        res = AVERROR_EOF;
        goto fail;
    } else {
        s->img_count++;
--- a/libavformat/isom.c
+++ b/libavformat/isom.c
@ -349,6 +349,8 @@ int ff_mp4_read_dec_config_descr(AVFormatContext *fc, AVStream *st, AVIOContext
           for MPEG-1 Audio or MPEG-2 Audio; MPEG-2 AAC excluded. */
        if (object_type_id == 0x69 || object_type_id == 0x6b)
            return 0;
+        if (!len)
+            return 0;
        if (!len || (uint64_t)len > (1<<30))
            return AVERROR_INVALIDDATA;
        if ((ret = ff_get_extradata(fc, st->codecpar, pb, len)) < 0)
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@ -289,7 +289,7 @@ typedef struct HEIFGrid {
 } HEIFGrid;

 typedef struct MOVContext {
-    const AVClass *class; ///< class for private options
+    const AVClass *avclass; ///< class for private options
    AVFormatContext *fc;
    int time_scale;
    int64_t duration;     ///< duration of the longest track
--- a/libavformat/matroska.c
+++ b/libavformat/matroska.c
@ -59,16 +59,12 @@ const CodecTags ff_mkv_codec_tags[]={
    {"A_VORBIS"         , AV_CODEC_ID_VORBIS},
    {"A_WAVPACK4"       , AV_CODEC_ID_WAVPACK},

-    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
-    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
-
    {"S_TEXT/UTF8"      , AV_CODEC_ID_SUBRIP},
    {"S_TEXT/UTF8"      , AV_CODEC_ID_TEXT},
    {"S_TEXT/ASCII"     , AV_CODEC_ID_TEXT},
    {"S_TEXT/ASS"       , AV_CODEC_ID_ASS},
    {"S_TEXT/SSA"       , AV_CODEC_ID_ASS},
+    {"S_TEXT/WEBVTT"    , AV_CODEC_ID_WEBVTT},
    {"S_ASS"            , AV_CODEC_ID_ASS},
    {"S_SSA"            , AV_CODEC_ID_ASS},
    {"S_VOBSUB"         , AV_CODEC_ID_DVD_SUBTITLE},
@ -77,6 +73,11 @@ const CodecTags ff_mkv_codec_tags[]={
    {"S_HDMV/TEXTST"    , AV_CODEC_ID_HDMV_TEXT_SUBTITLE},
    {"S_ARIBSUB"        , AV_CODEC_ID_ARIB_CAPTION},

+    {"D_WEBVTT/SUBTITLES"   , AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/CAPTIONS"    , AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/DESCRIPTIONS", AV_CODEC_ID_WEBVTT},
+    {"D_WEBVTT/METADATA"    , AV_CODEC_ID_WEBVTT},
+
    {"V_AV1"            , AV_CODEC_ID_AV1},
    {"V_AVS2"           , AV_CODEC_ID_AVS2},
    {"V_AVS3"           , AV_CODEC_ID_AVS3},
@ -129,9 +130,9 @@ const AVMetadataConv ff_mkv_metadata_conv[] = {

 const char * const ff_matroska_video_stereo_mode[MATROSKA_VIDEO_STEREOMODE_TYPE_NB] = {
    "mono",
-    "left_right",
-    "bottom_top",
-    "top_bottom",
+    "sbs_lr",
+    "tb_rl",
+    "tb_lr",
    "checkerboard_rl",
    "checkerboard_lr",
    "row_interleaved_rl",
@ -139,8 +140,8 @@ const char * const ff_matroska_video_stereo_mode[MATROSKA_VIDEO_STEREOMODE_TYPE_
    "col_interleaved_rl",
    "col_interleaved_lr",
    "anaglyph_cyan_red",
-    "right_left",
+    "sbs_rl",
    "anaglyph_green_magenta",
-    "block_lr",
-    "block_rl",
+    "mvc_lr",
+    "mvc_rl",
 };
--- a/libavformat/matroska.h
+++ b/libavformat/matroska.h
@ -438,4 +438,16 @@ extern const char * const ff_matroska_video_stereo_mode[MATROSKA_VIDEO_STEREOMOD

 #define DVCC_DVVC_BLOCK_TYPE_NAME "Dolby Vision configuration"

+typedef struct AVEdition { /* element type used by av_mkv_get_editions() */
+  int index; /* presumably the value av_mkv_set_next_edition() expects -- TODO confirm */
+  int ordered; /* NOTE(review): looks like a boolean "ordered edition" flag -- confirm */
+  int64_t duration; /* NOTE(review): time unit is not stated here -- confirm */
+  const char *title; /* NOTE(review): may be NULL and owner is unspecified -- confirm */
+} AVEdition;
+
+int av_mkv_get_num_editions(AVFormatContext *s);
+int av_mkv_get_editions(AVFormatContext *s, AVEdition **editions);
+int av_mkv_set_next_edition(AVFormatContext *s, int index);
+int av_mkv_get_edition(AVFormatContext *s);
+
 #endif /* AVFORMAT_MATROSKA_H */
--- a/libavformat/matroskadec_haali.c
+++ b/libavformat/matroskadec_haali.c
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@ -325,6 +325,17 @@ static int mov_metadata_hmmt(MOVContext *c, AVIOContext *pb, unsigned len)
    return 0;
 }

+/* Store key=str on the last stream when c->trak_index >= 0, else on the file. */
+static void mov_set_metadata(MOVContext *c, const char *key, const char *str)
+{
+    AVFormatContext *fc = c->fc;
+    AVDictionary **dict = &fc->metadata;
+
+    if (c->trak_index >= 0)
+        dict = &fc->streams[fc->nb_streams - 1]->metadata;
+    av_dict_set(dict, key, str, 0);
+}
+
 static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
 {
    char tmp_key[AV_FOURCC_MAX_STRING_SIZE] = {0};
@ -427,6 +438,7 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
    case MKTAG(0xa9,'w','r','n'): key = "warning";   break;
    case MKTAG(0xa9,'w','r','t'): key = "composer";  break;
    case MKTAG(0xa9,'x','y','z'): key = "location";  break;
+    case MKTAG( 'n','a','m','e'): key = "title";     break;
    }
 retry:
    if (c->itunes_metadata && atom.size > 8) {
@ -554,10 +566,10 @@ retry:
            str[str_size] = 0;
        }
        c->fc->event_flags |= AVFMT_EVENT_FLAG_METADATA_UPDATED;
-        av_dict_set(&c->fc->metadata, key, str, 0);
+        mov_set_metadata(c, key, str);
        if (*language && strcmp(language, "und")) {
            snprintf(key2, sizeof(key2), "%s-%s", key, language);
-            av_dict_set(&c->fc->metadata, key2, str, 0);
+            mov_set_metadata(c, key2, str);
        }
        if (!strcmp(key, "encoder")) {
            int major, minor, micro;
@ -5262,7 +5274,7 @@ static int mov_read_tkhd(MOVContext *c, AVIOContext *pb, MOVAtom atom)

    version = avio_r8(pb);
    flags = avio_rb24(pb);
-    st->disposition |= (flags & MOV_TKHD_FLAG_ENABLED) ? AV_DISPOSITION_DEFAULT : 0;
+    /* st->disposition |= (flags & MOV_TKHD_FLAG_ENABLED) ? AV_DISPOSITION_DEFAULT : 0; */

    if (version == 1) {
        avio_rb64(pb);
@ -6138,8 +6150,10 @@ static int mov_read_smdm(MOVContext *c, AVIOContext *pb, MOVAtom atom)
        av_log(c->fc, AV_LOG_WARNING, "Unsupported Mastering Display Metadata box version %d\n", version);
        return 0;
    }
-    if (sc->mastering)
-        return AVERROR_INVALIDDATA;
+    if (sc->mastering) {
+        av_log(c->fc, AV_LOG_WARNING, "Ignoring duplicate Mastering Display Metadata\n");
+        return 0;
+    }

    avio_skip(pb, 3); /* flags */

@ -6176,11 +6190,16 @@ static int mov_read_mdcv(MOVContext *c, AVIOContext *pb, MOVAtom atom)

    sc = c->fc->streams[c->fc->nb_streams - 1]->priv_data;

-    if (atom.size < 24 || sc->mastering) {
+    if (atom.size < 24) {
        av_log(c->fc, AV_LOG_ERROR, "Invalid Mastering Display Color Volume box\n");
        return AVERROR_INVALIDDATA;
    }

+    if (sc->mastering) {
+        av_log(c->fc, AV_LOG_WARNING, "Ignoring duplicate Mastering Display Color Volume\n");
+        return 0;
+    }
+
    sc->mastering = av_mastering_display_metadata_alloc();
    if (!sc->mastering)
        return AVERROR(ENOMEM);
@ -10215,7 +10234,7 @@ static int mov_read_seek(AVFormatContext *s, int stream_index, int64_t sample_ti
                continue;

            timestamp = av_rescale_q(seek_timestamp, s->streams[stream_index]->time_base, st->time_base);
-            sample = mov_seek_stream(s, st, timestamp, flags);
+            sample = mov_seek_stream(s, st, timestamp, flags | AVSEEK_FLAG_ANY);
            if (sample >= 0)
                sti->skip_samples = mov_get_skip_samples(st, sample);
        }
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@ -815,7 +815,7 @@ static const StreamType ISO_types[] = {
 #endif
    { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
    { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC        },
-    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
+    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC   },
    { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
    { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
    { 0x33, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_VVC        },
--- a/libavformat/oggparsevorbis.c
+++ b/libavformat/oggparsevorbis.c
@ -125,14 +125,14 @@ static int vorbis_parse_single_comment(AVFormatContext *as, AVDictionary **m,
            goto end;
        }
        ret = av_base64_decode(pict, v, len);
-        if (ret > 0)
+        if (as && ret > 0)
            ret = ff_flac_parse_picture(as, &pict, ret, 0);
        av_freep(&pict);
        if (ret < 0) {
            av_log(as, AV_LOG_WARNING, "Failed to parse cover art block.\n");
            goto end;
        }
-    } else if (!ogm_chapter(as, t, v)) {
+    } else if (!as || !ogm_chapter(as, t, v)) {
        (*updates)++;
        if (av_dict_get(*m, t, NULL, 0))
            av_dict_set(m, t, ";", AV_DICT_APPEND);
--- a/libavformat/riff.c
+++ b/libavformat/riff.c
@ -506,6 +506,16 @@ const AVCodecTag ff_codec_bmp_tags[] = {
    { AV_CODEC_ID_RTV1,         MKTAG('R', 'T', 'V', '1') },
    { AV_CODEC_ID_VMIX,         MKTAG('V', 'M', 'X', '1') },
    { AV_CODEC_ID_LEAD,         MKTAG('L', 'E', 'A', 'D') },
+
+    { AV_CODEC_ID_HEVC,         MKTAG('H', 'E', 'V', 'C') },
+    { AV_CODEC_ID_HEVC,         MKTAG('H', 'V', 'C', '1') },
+    { AV_CODEC_ID_HEVC,         MKTAG('H', 'M', '1', '0') },
+    { AV_CODEC_ID_PRORES,       MKTAG('a', 'p', 'c', 'h') },
+    { AV_CODEC_ID_PRORES,       MKTAG('a', 'p', 'c', 'n') },
+    { AV_CODEC_ID_PRORES,       MKTAG('a', 'p', 'c', 's') },
+    { AV_CODEC_ID_PRORES,       MKTAG('a', 'p', 'c', 'o') },
+    { AV_CODEC_ID_PRORES,       MKTAG('a', 'p', '4', 'h') },
+
    { AV_CODEC_ID_NONE,         0 }
 };

--- a/libavformat/riffdec.c
+++ b/libavformat/riffdec.c
@ -99,7 +99,8 @@ int ff_get_wav_header(void *logctx, AVIOContext *pb,

    if (size < 14) {
        avpriv_request_sample(logctx, "wav header size < 14");
-        return AVERROR_INVALIDDATA;
+        avio_skip(pb, size);
+        return 0;
    }

    av_channel_layout_uninit(&par->ch_layout);
--- a/libavformat/rtpdec_asf.c
+++ b/libavformat/rtpdec_asf.c
@ -119,7 +119,7 @@ int ff_wms_parse_sdp_a_line(AVFormatContext *s, const char *p)
            avformat_close_input(&rt->asf_ctx);
        }

-        if (!(iformat = av_find_input_format("asf")))
+        if (!(iformat = av_find_input_format("asf")) && !(iformat = av_find_input_format("asf_o")))
            return AVERROR_DEMUXER_NOT_FOUND;

        rt->asf_ctx = avformat_alloc_context();
--- a/libavformat/spdifenc.c
+++ b/libavformat/spdifenc.c
@ -685,7 +685,7 @@ const FFOutputFormat ff_spdif_muxer = {
    .write_header      = spdif_write_header,
    .write_packet      = spdif_write_packet,
    .deinit            = spdif_deinit,
-    .p.flags           = AVFMT_NOTIMESTAMPS,
+    .p.flags           = AVFMT_NOTIMESTAMPS | AVFMT_NOFILE,
    .p.priv_class      = &spdif_class,
    .flags_internal    = FF_OFMT_FLAG_MAX_ONE_OF_EACH,
 };
--- a/libavformat/udp.c
+++ b/libavformat/udp.c
@ -1118,6 +1118,7 @@ static int udp_close(URLContext *h)
    if (s->is_multicast && (h->flags & AVIO_FLAG_READ))
        udp_leave_multicast_group(s->udp_fd, (struct sockaddr *)&s->dest_addr,
                                  (struct sockaddr *)&s->local_addr_storage, h);
+    closesocket(s->udp_fd);
 #if HAVE_PTHREAD_CANCEL
    if (s->thread_started) {
        int ret;
@ -1140,7 +1141,6 @@ static int udp_close(URLContext *h)
        pthread_cond_destroy(&s->cond);
    }
 #endif
-    closesocket(s->udp_fd);
    av_fifo_freep2(&s->fifo);
    ff_ip_reset_filters(&s->filters);
    return 0;
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@ -603,3 +603,87 @@ int ff_bprint_to_codecpar_extradata(AVCodecParameters *par, struct AVBPrint *buf
    par->extradata_size = buf->len;
    return 0;
 }
+
+/* Parsing mode currently recorded for st.
+ * A NULL st yields AVSTREAM_PARSE_NONE (the enum's zero value). */
+enum AVStreamParseType av_lav_stream_parser_get_needed(const AVStream *st)
+{
+    return st ? cffstream(st)->need_parsing : AVSTREAM_PARSE_NONE;
+}
+
+/* Record needed as the parsing mode of st; a NULL st is silently ignored. */
+void av_lav_stream_parser_set_needed(AVStream *st, enum AVStreamParseType needed)
+{
+    if (st)
+        ffstream(st)->need_parsing = needed;
+}
+
+/*
+ * Attach a parser to st unless it already has one. The parser is created
+ * from st->codecpar->codec_id; nothing more happens if av_parser_init()
+ * returns NULL. A NULL st is ignored.
+ */
+void av_lav_stream_parser_init(AVStream *st)
+{
+    FFStream *sti = st ? ffstream(st) : NULL;
+
+    if (!st || sti->parser)
+        return;
+
+    sti->parser = av_parser_init(st->codecpar->codec_id);
+    if (!sti->parser)
+        return;
+
+    /* Seed the parser flags from need_parsing; only these two modes add a
+     * flag, every other mode leaves the flags as av_parser_init() set them. */
+    if (sti->need_parsing == AVSTREAM_PARSE_HEADERS)
+        sti->parser->flags |= PARSER_FLAG_COMPLETE_FRAMES;
+    else if (sti->need_parsing == AVSTREAM_PARSE_FULL_ONCE)
+        sti->parser->flags |= PARSER_FLAG_ONCE;
+}
+
+/* Flags of the parser attached to st, or 0 when st is NULL or has no parser. */
+int av_lav_stream_parser_get_flags(const AVStream *st)
+{
+    const FFStream *sti;
+
+    if (!st)
+        return 0;
+
+    sti = cffstream(st);
+    /* A stream without a parser reports an empty flag set. */
+    return sti->parser ? sti->parser->flags : 0;
+}
+
+/* Overwrite (not OR into) the flags of the parser attached to st;
+ * a NULL st or a stream without a parser is left untouched. */
+void av_lav_stream_parser_update_flags(AVStream *st, int flags)
+{
+    FFStream *sti = st ? ffstream(st) : NULL;
+
+    /* sti is only dereferenced once st is known to be non-NULL. */
+    if (st && sti->parser)
+        sti->parser->flags = flags;
+}
+
+/* Value of the codec_info_nb_frames field kept for st;
+ * 0 when st is NULL. */
+int av_lav_stream_codec_info_nb_frames(const AVStream *st)
+{
+    return st ? cffstream(st)->codec_info_nb_frames : 0;
+}
+
+/*
+ * Copy time_base / ticks_per_frame of the stream's avctx into each
+ * non-NULL output pointer. Returns 0, or -1 when st is NULL.
+ */
+int av_lav_stream_get_timing_info(const AVStream *st, AVRational *tb, int *ticks_per_frame)
+{
+    if (!st)
+        return -1;
+    if (tb)
+        *tb = cffstream(st)->avctx->time_base;
+    if (ticks_per_frame)
+        *ticks_per_frame = cffstream(st)->avctx->ticks_per_frame;
+    return 0;
+}
--- a/libavutil/avutil.h
+++ b/libavutil/avutil.h
@ -245,7 +245,7 @@ const char *av_get_media_type_string(enum AVMediaType media_type);
 * either pts or dts.
 */

-#define AV_NOPTS_VALUE          ((int64_t)UINT64_C(0x8000000000000000))
+#define AV_NOPTS_VALUE          ((int64_t)INT64_C(0x8000000000000000)) /* cast: the bare constant may be unsigned */

 /**
 * Internal time base represented as integer
--- a/libavutil/mastering_display_metadata.h
+++ b/libavutil/mastering_display_metadata.h
@ -56,6 +56,30 @@ typedef struct AVMasteringDisplayMetadata {
     */
    AVRational max_luminance;

+    /**
+     * MPEG vs JPEG YUV range.
+     * Valid only when has_colorspace is set. Accessed directly as a struct
+     * field; the av_frame_*() color accessors operate on AVFrame, not here.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorRange color_range;
+
+    enum AVColorPrimaries color_primaries;
+
+    enum AVColorTransferCharacteristic color_trc;
+
+    /**
+     * YUV colorspace type.
+     * Valid only when has_colorspace is set. Accessed directly as a struct
+     * field; the av_frame_*() color accessors operate on AVFrame, not here.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorSpace colorspace;
+
+    enum AVChromaLocation chroma_location;
+
    /**
     * Flag indicating whether the display primaries (and white point) are set.
     */
@ -66,6 +90,10 @@ typedef struct AVMasteringDisplayMetadata {
     */
    int has_luminance;

+    /**
+     * Flag indicating whether the color_range, color_primaries, color_trc, colorspace and chroma_location have been set
+     */
+    int has_colorspace;
 } AVMasteringDisplayMetadata;

 /**
--- a/libswresample/options.c
+++ b/libswresample/options.c
@ -122,6 +122,7 @@ static const AVOption options[]={
 { "kaiser_beta"         , "set swr Kaiser window beta"  , OFFSET(kaiser_beta)    , AV_OPT_TYPE_DOUBLE  , {.dbl=9                     }, 2      , 16        , PARAM },

 { "output_sample_bits"  , "set swr number of output sample bits", OFFSET(dither.output_sample_bits), AV_OPT_TYPE_INT  , {.i64=0   }, 0      , 64        , PARAM },
+{ "clip_protection"     , "Clipping Protection"         , OFFSET(clip_protection), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, PARAM },
 {0}
 };

--- a/libswresample/rematrix.c
+++ b/libswresample/rematrix.c
@ -424,7 +424,7 @@ av_cold static int auto_matrix(SwrContext *s)

    if (s->rematrix_maxval > 0) {
        maxval = s->rematrix_maxval;
-    } else if (   av_get_packed_sample_fmt(s->out_sample_fmt) < AV_SAMPLE_FMT_FLT
+    } else if (  (av_get_packed_sample_fmt(s->out_sample_fmt) < AV_SAMPLE_FMT_FLT && !s->clip_protection)
               || av_get_packed_sample_fmt(s->int_sample_fmt) < AV_SAMPLE_FMT_FLT) {
        maxval = 1.0;
    } else
@ -452,6 +452,7 @@ av_cold int swri_rematrix_init(SwrContext *s){
    int nb_out = s->out.ch_count;

    s->mix_any_f = NULL;
+    s->clip_max = 1.0f;

    if (!s->rematrix_custom) {
        int r = auto_matrix(s);
@ -566,7 +567,7 @@ int swri_rematrix(SwrContext *s, AudioData *out, AudioData *in, int len, int mus

    if(s->mix_any_f) {
        s->mix_any_f(out->ch, (const uint8_t **)in->ch, s->native_matrix, len);
-        return 0;
+        goto clip_protection;
    }

    if(s->mix_2_1_simd || s->mix_1_1_simd){
@ -637,5 +638,22 @@ int swri_rematrix(SwrContext *s, AudioData *out, AudioData *in, int len, int mus
            }
        }
    }
+
+clip_protection:
+    if (s->clip_protection && s->int_sample_fmt == AV_SAMPLE_FMT_FLTP)
+    {
+        for(j = 0; j < out->ch_count; j++) {
+            for(i = 0; i < len; i++) {
+                const float sample = fabsf(((float *)out->ch[j])[i]);
+                if (sample > s->clip_max) {
+                    s->clip_max = sample;
+                    av_log(s, AV_LOG_INFO, "Clipping protection at %.3f\n", sample);
+                }
+                if (s->clip_max > 1.0f)
+                    ((float *)out->ch[j])[i] /= s->clip_max;
+            }
+        }
+    }
+
    return 0;
 }
--- a/libswresample/swresample_internal.h
+++ b/libswresample/swresample_internal.h
@ -183,6 +183,9 @@ struct SwrContext {
    mix_any_func_type *mix_any_f;

    /* TODO: callbacks for ASM optimizations */
+
+    int clip_protection;
+    float clip_max;
 };

 av_warn_unused_result
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@ -2510,16 +2510,30 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
    int64_t src_h_chr_pos = -513, dst_h_chr_pos = -513,
            src_v_chr_pos = -513, dst_v_chr_pos = -513;

+    enum AVPixelFormat srcFormatHandled = srcFormat, dstFormatHandled = dstFormat;
+    int src_range  = handle_jpeg(&srcFormatHandled);
+    int src_xyz    = handle_xyz(&srcFormatHandled);
+    int src_0alpha = handle_0alpha(&srcFormatHandled);
+    int dst_range  = handle_jpeg(&dstFormatHandled);
+    int dst_xyz    = handle_xyz(&dstFormatHandled);
+    int dst_0alpha = handle_0alpha(&dstFormatHandled);
+
    if (!param)
        param = default_param;

    if (context &&
        (context->srcW      != srcW      ||
         context->srcH      != srcH      ||
-         context->srcFormat != srcFormat ||
+         context->srcFormat != srcFormatHandled ||
+         context->srcRange  != src_range ||
+         context->srcXYZ    != src_xyz   ||
+         context->src0Alpha != src_0alpha ||
         context->dstW      != dstW      ||
         context->dstH      != dstH      ||
-         context->dstFormat != dstFormat ||
+         context->dstFormat != dstFormatHandled ||
+         context->dstRange  != dst_range ||
+         context->dstXYZ    != dst_xyz   ||
+         context->dst0Alpha != dst_0alpha ||
         context->flags     != flags     ||
         context->param[0]  != param[0]  ||
         context->param[1]  != param[1])) {