When drawing transparent triangle lists, you often run into ordering problems where triangles within the list itself have draw order issues. From some directions, the alpha-blended mesh looks just fine; from others, you get an inverted geometry effect. This is because you can't use the z buffer for blended geometry. Instead, you must sort your triangles in "painter's algorithm" order, which means the yonmost triangles first.

Sorting all triangles for each frame is not realistic. And it might seem that you can't pre-sort triangles in a mesh, because the mesh could be drawn from any angle. However, it turns out that you can generate a partial ordering of the triangles, such that, more often than not, the mesh will draw the yon-most triangles first, from any view direction. This works well for convex meshes, but even works for many concave meshes, such as a "T" shape. Meshes where the ordering contains cycles, though (such as a "+" shape) will not be 100% correct when only using a static sort. However, the problem is so much improved, that static sorting still is a good idea, and often enough of a solution.

The trick is to split triangles that are considered two-sided, into two separate triangles, each of which is single-sided. These two triangles can then draw in totally different order within the mesh. With culling turned on, the mesh can then be drawn from any angle (or almost any angle) with minimal or no artifacts because of internal blending order.

I implemented code that can sort a ID3DXMesh interface based on this kind of ordering. The code is quickly written (uses globals) and takes a while to run (N-squared in both memory and runtime), but should be of use to tool developers who haven't yet put this technique into their exporter/art pipe.

struct FaceRef {
  float fscore;
  unsigned short face;
};
struct Face {
  unsigned short ix[3];       //  the face
  unsigned short nfront;      //  how many faces I am in front of
  FaceRef *infront;           //  refs to faces I am in front of, with scores
  float fnorm[3];             //  the face normal
  float fdist;                //  the face distance from origin in the direction of the normal
  float score;                //  the current cost of this face (sum of other faces behind)
  float area;
};
Face *faces = 0;
FaceRef *frefs = 0;
size_t frtop = 0;
void *vbase = 0;
size_t vstride = 0;

static void ClassifyAndScoreFaces(size_t totFaces)
{
  for (size_t g = 0; g < totFaces; ++g) {
    if (!(g % 100)) {
      wchar_t str[100];
      swprintf(str, L"Face %d of %d\n", g, totFaces);
      ::OutputDebugString(str);
    }
    assert(faces[g].ix[0] != faces[g].ix[1]);
    faces[g].infront = frefs+frtop;
    for (size_t f = 0; f < totFaces; ++f) {
      if (f != g) {
        //  classify all verts of g based on plane of f, 
        //  and at the same time f based on g
        int vfront = 0, gfront = 0;
        for (int i = 0; i < 3; ++i) {
          D3DXVECTOR3 v = *(D3DXVECTOR3 *)((char *)vbase + vstride * faces[g].ix[i]);
          float d = D3DXVec3Dot(&v, (D3DXVECTOR3 *)faces[f].fnorm) - faces[f].fdist;
          //  this epsilon should be negative
          if (d < -1e-4) {
            vfront++;
          }
          v = *(D3DXVECTOR3 *)((char *)vbase + vstride * faces[f].ix[i]);
          d = D3DXVec3Dot(&v, (D3DXVECTOR3 *)faces[g].fnorm) - faces[g].fdist;
          //  this epsilon should be positive
          if (d < 1e-4) {
            gfront++;
          }
        }
        if (vfront && (gfront < 3)) {
          //  I'm at least partially in front of this triangle
          //  and not entirely culled by him.
          FaceRef &fr = faces[g].infront[faces[g].nfront];
          //  g has f in front of him/her
          fr.face = (unsigned short)f;
          //  How much area can be covered by this in-front-ness, and 
          //  what angle is it at? Note that many small triangles covering one 
          //  big triangle shouldn't allow all the small ones to each cover the 
          //  big, therefore use the largest area of the two of the pair.
          fr.fscore = (float)(std::max(faces[f].area, faces[g].area)
              * (1 + D3DXVec3Dot((D3DXVECTOR3 *)faces[f].fnorm, (D3DXVECTOR3 *)faces[g].fnorm)) * 0.5 * vfront / 3);
          faces[g].nfront++;

          //  the score/cost of f is incremented by g's reference
          faces[f].score += fr.fscore;
          ++frtop;
        }
      }
    }
    if (faces[g].nfront == 0) {
      wchar_t str[100];
      swprintf(str, L"Face %d is behind every other face at the beginning.\n", g);
      ::OutputDebugString(str);
    }
  }
}


static void DuplicateAndReorderForTransparency(
  void const *ptrFrom, size_t stride, size_t numVertices,
  void const *ixFrom, size_t numFaces,
  void *ptrTo,
  void *ixTo,
  size_t normal)
{
  size_t swapOffset = 0;
  memcpy(ptrTo, ptrFrom, stride*numVertices);
  if (normal) {
    // duplicate triangles, facing the other way
    swapOffset = numVertices;
    memcpy((char *)ptrTo + stride*numVertices, ptrFrom, stride*numVertices);
    for (size_t i = 0; i < numVertices; ++i) {
      float *fp = (float *)((char *)ptrTo + stride*(swapOffset + i) + normal);
      fp[0] *= -1;
      fp[1] *= -1;
      fp[2] *= -1;
    }
  }
  assert(_CrtCheckMemory());

  size_t nuNumFaces = numFaces * (swapOffset ? 2 : 1);
  faces = new Face[nuNumFaces];
  memset(faces, 0, sizeof(Face)*(nuNumFaces));
  frefs = new FaceRef[nuNumFaces*nuNumFaces];
  memset(frefs, 0, sizeof(FaceRef)*nuNumFaces*nuNumFaces);
  frtop = 0;
  vbase = ptrTo;
  vstride = stride;
  unsigned short const *ixSrc = (unsigned short const *)ixFrom;
  for (size_t f = 0; f < numFaces; ++f) {
    //  wind the triangle
    faces[f].ix[0] = ixSrc[f*3+0];
    faces[f].ix[1] = ixSrc[f*3+1];
    faces[f].ix[2] = ixSrc[f*3+2];
    //  delay in-front calculation, but allocate worst-case memory
    faces[f].nfront = 0;
    faces[f].infront = 0;
    //  cross product to create fnorm
    D3DXVECTOR3 a = *(D3DXVECTOR3 *)((char *)ptrFrom + stride * ixSrc[f*3+0]);
    D3DXVECTOR3 b = *(D3DXVECTOR3 *)((char *)ptrFrom + stride * ixSrc[f*3+1]);
    D3DXVECTOR3 c = *(D3DXVECTOR3 *)((char *)ptrFrom + stride * ixSrc[f*3+2]);
    b = b - a;
    c = c - a;
    D3DXVec3Cross(&a, &b, &c);
    faces[f].area = D3DXVec3Length(&a) * 0.5f;
    D3DXVec3Normalize((D3DXVECTOR3 *)faces[f].fnorm, &a);
    //  dot product to calculate fdist
    faces[f].fdist = D3DXVec3Dot(
        (D3DXVECTOR3 *)((char *)ptrFrom + stride * ixSrc[f*3+0]), (D3DXVECTOR3 *)faces[f].fnorm);
    //  delay score calculation
    faces[f].score = 0;
  }
  assert(_CrtCheckMemory());

  if (swapOffset) {
    memcpy(faces + numFaces, faces, sizeof(Face) * numFaces);
    for (size_t f = numFaces; f < nuNumFaces; ++f) {
      //  wind the triangle the other way
      unsigned short s = faces[f].ix[0];
      faces[f].ix[0] = (unsigned short)(faces[f].ix[1] + swapOffset);
      faces[f].ix[1] = (unsigned short)(s + swapOffset);
      faces[f].ix[2] = (unsigned short)(faces[f].ix[2] + swapOffset);
      //  delay in-front calculation, but allocate worst-case memory
      faces[f].nfront = 0;
      faces[f].infront = 0;
      //  flip face normal
      faces[f].fnorm[0] *= -1;
      faces[f].fnorm[1] *= -1;
      faces[f].fnorm[2] *= -1;
      //  The distance is (by definition) the negative
      faces[f].fdist *= -1;
      //  delay score calculation
      faces[f].score = 0;
    }
  }
  assert(_CrtCheckMemory());

  ClassifyAndScoreFaces(nuNumFaces);
  assert(_CrtCheckMemory());

  unsigned short *ixDst = (unsigned short *)ixTo;

#if 0
    for (size_t i = 0; i < nuNumFaces; ++i) {
    ixDst[0] = faces[i].ix[0];
    ixDst[1] = faces[i].ix[1];
    ixDst[2] = faces[i].ix[2];
    ixDst += 3;
  }
#else
  //  Repeat picking off the lowest-scoring face and putting it in the buffer
  for (size_t i = 0; i < nuNumFaces; ++i) {
    //  I don't want to re-order the buffer, because face references are by 
    //  index. Thus, I get a fully N-squared-over-two search here. At least 
    //  it will cache well, because it's a linear array.
    float lowest = faces[i].score;
    size_t lowestIx = i;
    for (size_t f = 0; f < nuNumFaces; ++f) {
      if (faces[f].score < lowest) {
        lowest = faces[f].score;
        lowestIx = f;
      }
    }
    assert(lowest < 1e20f);
    assert(faces[lowestIx].ix[0] != faces[lowestIx].ix[1]);
    assert(faces[lowestIx].score == lowest);
    if (lowest > 0) {
      wchar_t str[100];
      swprintf(str, L"Picking face %d with score %.3f > 0.\n", lowestIx, lowest);
      ::OutputDebugString(str);
    }
    ixDst[0] = faces[lowestIx].ix[0];
    ixDst[1] = faces[lowestIx].ix[1];
    ixDst[2] = faces[lowestIx].ix[2];
    ixDst += 3;
    faces[lowestIx].score = 1e20f;
    //  remove the score from the referenced faces
    for (size_t q = 0; q < faces[lowestIx].nfront; ++q) {
      faces[faces[lowestIx].infront[q].face].score -= faces[lowestIx].infront[q].fscore;
    }

    if (!(i % 100)) {
      wchar_t str[100];
      swprintf(str, L"Generated %d faces.\n", i);
      ::OutputDebugString(str);
    }
  }
#endif
  assert(_CrtCheckMemory());
  assert(((unsigned short *)ixDst - (unsigned short *)ixTo)/3 == (long int)nuNumFaces);

  delete[] faces;
  delete[] frefs;
  faces = 0;
  frefs = 0;
}


// From is a loaded mesh
//
// To is a created mesh, which has enough space for the sorted, 
// duplicated vertices/triangles. If the mesh has normals, this 
// is exactly twice the size of the input mesh; else it's the 
// same as the input mesh.
//
static void SortMeshTriangles(ID3DXMesh *from, ID3DXMesh **to)
{
  HRESULT hr;

  CComPtr<IDirect3DDevice9> dev;
  hr = from->GetDevice(&dev);
  CComPtr<IDirect3DVertexBuffer9> vbFrom;
  hr = from->GetVertexBuffer(&vbFrom);
  CComPtr<IDirect3DIndexBuffer9> ibFrom;
  hr = from->GetIndexBuffer(&ibFrom);
  DWORD stride = from->GetNumBytesPerVertex();
  DWORD numFaces = from->GetNumFaces();
  DWORD numVertices = from->GetNumVertices();
  DWORD options = from->GetOptions();
  D3DVERTEXELEMENT9 decl[32];
  memset(decl, 0, sizeof(decl));
  hr = from->GetDeclaration(decl);

  hr = D3DXCreateMesh(numFaces * 2, numVertices * 2, options, decl, dev, to);
  CComPtr<IDirect3DVertexBuffer9> vbTo;
  hr = (*to)->GetVertexBuffer(&vbTo);
  CComPtr<IDirect3DIndexBuffer9> ibTo;
  hr = (*to)->GetIndexBuffer(&ibTo);
  void *ptrFrom;
  hr = vbFrom->Lock(0, 0, &ptrFrom, D3DLOCK_READONLY);
  void *ptrTo;
  hr = vbTo->Lock(0, 0, &ptrTo, 0);
  void *ixFrom;
  hr = ibFrom->Lock(0, 0, &ixFrom, D3DLOCK_READONLY);
  void *ixTo;
  hr = ibTo->Lock(0, 0, &ixTo, 0);

  DWORD normal = 0;
  for (int i = 0; i < 32; ++i) {
    if (decl[i].Usage == D3DDECLUSAGE_NORMAL) {
      normal = decl[i].Offset;
      break;
    }
  }

  DuplicateAndReorderForTransparency(
      ptrFrom, stride, numVertices,
      ixFrom, numFaces,
      ptrTo,
      ixTo,
      normal);
  vbFrom->Unlock();
  vbTo->Unlock();
  ibFrom->Unlock();
  ibTo->Unlock();
}